Importing necessary libraries and modules
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
import seaborn as sns
import webbrowser
import timeit
from sklearn.impute import KNNImputer
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import SelectFromModel
from sklearn.model_selection import RandomizedSearchCV,train_test_split, cross_val_score
from scipy.stats import skew, norm
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score, make_scorer
from sklearn.linear_model import LinearRegression, LassoCV
from sklearn.tree import DecisionTreeRegressor, export_graphviz
from sklearn.ensemble import RandomForestRegressor
import graphviz
import xgboost as xgb
import tensorflow as tf
import tensorflow.keras as keras
from keras.layers import Dense,Conv2D, MaxPooling2D, Flatten, Activation
from keras.models import Sequential
from keras.regularizers import l1
from keras.wrappers.scikit_learn import KerasRegressor
from keras_visualizer import visualizer
C:\Users\david\anaconda3\envs\tf-gpu_T\lib\site-packages\xgboost\compat.py:36: FutureWarning: pandas.Int64Index is deprecated and will be removed from pandas in a future version. Use pandas.Index with the appropriate dtype instead. from pandas import MultiIndex, Int64Index
There were some warning messages that repeated constantly later in the code so, though I'm aware of the issues, I ignore these warnings here.
import warnings
warnings.filterwarnings("ignore", category=UserWarning)
warnings.filterwarnings("ignore", category=FutureWarning)
Setting up display options for pandas
pd.set_option('display.max_row', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.float_format', lambda x: '%.5f' % x)
Loading and checking over dataset
amesH_data = pd.read_excel('data/AmesHousing.xls')
amesH_data.head(10)
| Order | PID | MS SubClass | MS Zoning | Lot Frontage | Lot Area | Street | Alley | Lot Shape | Land Contour | Utilities | Lot Config | Land Slope | Neighborhood | Condition 1 | Condition 2 | Bldg Type | House Style | Overall Qual | Overall Cond | Year Built | Year Remod/Add | Roof Style | Roof Matl | Exterior 1st | Exterior 2nd | Mas Vnr Type | Mas Vnr Area | Exter Qual | Exter Cond | Foundation | Bsmt Qual | Bsmt Cond | Bsmt Exposure | BsmtFin Type 1 | BsmtFin SF 1 | BsmtFin Type 2 | BsmtFin SF 2 | Bsmt Unf SF | Total Bsmt SF | Heating | Heating QC | Central Air | Electrical | 1st Flr SF | 2nd Flr SF | Low Qual Fin SF | Gr Liv Area | Bsmt Full Bath | Bsmt Half Bath | Full Bath | Half Bath | Bedroom AbvGr | Kitchen AbvGr | Kitchen Qual | TotRms AbvGrd | Functional | Fireplaces | Fireplace Qu | Garage Type | Garage Yr Blt | Garage Finish | Garage Cars | Garage Area | Garage Qual | Garage Cond | Paved Drive | Wood Deck SF | Open Porch SF | Enclosed Porch | 3Ssn Porch | Screen Porch | Pool Area | Pool QC | Fence | Misc Feature | Misc Val | Mo Sold | Yr Sold | Sale Type | Sale Condition | SalePrice | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 | 526301100 | 20 | RL | 141.00000 | 31770 | Pave | NaN | IR1 | Lvl | AllPub | Corner | Gtl | NAmes | Norm | Norm | 1Fam | 1Story | 6 | 5 | 1960 | 1960 | Hip | CompShg | BrkFace | Plywood | Stone | 112.00000 | TA | TA | CBlock | TA | Gd | Gd | BLQ | 639.00000 | Unf | 0.00000 | 441.00000 | 1080.00000 | GasA | Fa | Y | SBrkr | 1656 | 0 | 0 | 1656 | 1.00000 | 0.00000 | 1 | 0 | 3 | 1 | TA | 7 | Typ | 2 | Gd | Attchd | 1960.00000 | Fin | 2.00000 | 528.00000 | TA | TA | P | 210 | 62 | 0 | 0 | 0 | 0 | NaN | NaN | NaN | 0 | 5 | 2010 | WD | Normal | 215000 |
| 1 | 2 | 526350040 | 20 | RH | 80.00000 | 11622 | Pave | NaN | Reg | Lvl | AllPub | Inside | Gtl | NAmes | Feedr | Norm | 1Fam | 1Story | 5 | 6 | 1961 | 1961 | Gable | CompShg | VinylSd | VinylSd | None | 0.00000 | TA | TA | CBlock | TA | TA | No | Rec | 468.00000 | LwQ | 144.00000 | 270.00000 | 882.00000 | GasA | TA | Y | SBrkr | 896 | 0 | 0 | 896 | 0.00000 | 0.00000 | 1 | 0 | 2 | 1 | TA | 5 | Typ | 0 | NaN | Attchd | 1961.00000 | Unf | 1.00000 | 730.00000 | TA | TA | Y | 140 | 0 | 0 | 0 | 120 | 0 | NaN | MnPrv | NaN | 0 | 6 | 2010 | WD | Normal | 105000 |
| 2 | 3 | 526351010 | 20 | RL | 81.00000 | 14267 | Pave | NaN | IR1 | Lvl | AllPub | Corner | Gtl | NAmes | Norm | Norm | 1Fam | 1Story | 6 | 6 | 1958 | 1958 | Hip | CompShg | Wd Sdng | Wd Sdng | BrkFace | 108.00000 | TA | TA | CBlock | TA | TA | No | ALQ | 923.00000 | Unf | 0.00000 | 406.00000 | 1329.00000 | GasA | TA | Y | SBrkr | 1329 | 0 | 0 | 1329 | 0.00000 | 0.00000 | 1 | 1 | 3 | 1 | Gd | 6 | Typ | 0 | NaN | Attchd | 1958.00000 | Unf | 1.00000 | 312.00000 | TA | TA | Y | 393 | 36 | 0 | 0 | 0 | 0 | NaN | NaN | Gar2 | 12500 | 6 | 2010 | WD | Normal | 172000 |
| 3 | 4 | 526353030 | 20 | RL | 93.00000 | 11160 | Pave | NaN | Reg | Lvl | AllPub | Corner | Gtl | NAmes | Norm | Norm | 1Fam | 1Story | 7 | 5 | 1968 | 1968 | Hip | CompShg | BrkFace | BrkFace | None | 0.00000 | Gd | TA | CBlock | TA | TA | No | ALQ | 1065.00000 | Unf | 0.00000 | 1045.00000 | 2110.00000 | GasA | Ex | Y | SBrkr | 2110 | 0 | 0 | 2110 | 1.00000 | 0.00000 | 2 | 1 | 3 | 1 | Ex | 8 | Typ | 2 | TA | Attchd | 1968.00000 | Fin | 2.00000 | 522.00000 | TA | TA | Y | 0 | 0 | 0 | 0 | 0 | 0 | NaN | NaN | NaN | 0 | 4 | 2010 | WD | Normal | 244000 |
| 4 | 5 | 527105010 | 60 | RL | 74.00000 | 13830 | Pave | NaN | IR1 | Lvl | AllPub | Inside | Gtl | Gilbert | Norm | Norm | 1Fam | 2Story | 5 | 5 | 1997 | 1998 | Gable | CompShg | VinylSd | VinylSd | None | 0.00000 | TA | TA | PConc | Gd | TA | No | GLQ | 791.00000 | Unf | 0.00000 | 137.00000 | 928.00000 | GasA | Gd | Y | SBrkr | 928 | 701 | 0 | 1629 | 0.00000 | 0.00000 | 2 | 1 | 3 | 1 | TA | 6 | Typ | 1 | TA | Attchd | 1997.00000 | Fin | 2.00000 | 482.00000 | TA | TA | Y | 212 | 34 | 0 | 0 | 0 | 0 | NaN | MnPrv | NaN | 0 | 3 | 2010 | WD | Normal | 189900 |
| 5 | 6 | 527105030 | 60 | RL | 78.00000 | 9978 | Pave | NaN | IR1 | Lvl | AllPub | Inside | Gtl | Gilbert | Norm | Norm | 1Fam | 2Story | 6 | 6 | 1998 | 1998 | Gable | CompShg | VinylSd | VinylSd | BrkFace | 20.00000 | TA | TA | PConc | TA | TA | No | GLQ | 602.00000 | Unf | 0.00000 | 324.00000 | 926.00000 | GasA | Ex | Y | SBrkr | 926 | 678 | 0 | 1604 | 0.00000 | 0.00000 | 2 | 1 | 3 | 1 | Gd | 7 | Typ | 1 | Gd | Attchd | 1998.00000 | Fin | 2.00000 | 470.00000 | TA | TA | Y | 360 | 36 | 0 | 0 | 0 | 0 | NaN | NaN | NaN | 0 | 6 | 2010 | WD | Normal | 195500 |
| 6 | 7 | 527127150 | 120 | RL | 41.00000 | 4920 | Pave | NaN | Reg | Lvl | AllPub | Inside | Gtl | StoneBr | Norm | Norm | TwnhsE | 1Story | 8 | 5 | 2001 | 2001 | Gable | CompShg | CemntBd | CmentBd | None | 0.00000 | Gd | TA | PConc | Gd | TA | Mn | GLQ | 616.00000 | Unf | 0.00000 | 722.00000 | 1338.00000 | GasA | Ex | Y | SBrkr | 1338 | 0 | 0 | 1338 | 1.00000 | 0.00000 | 2 | 0 | 2 | 1 | Gd | 6 | Typ | 0 | NaN | Attchd | 2001.00000 | Fin | 2.00000 | 582.00000 | TA | TA | Y | 0 | 0 | 170 | 0 | 0 | 0 | NaN | NaN | NaN | 0 | 4 | 2010 | WD | Normal | 213500 |
| 7 | 8 | 527145080 | 120 | RL | 43.00000 | 5005 | Pave | NaN | IR1 | HLS | AllPub | Inside | Gtl | StoneBr | Norm | Norm | TwnhsE | 1Story | 8 | 5 | 1992 | 1992 | Gable | CompShg | HdBoard | HdBoard | None | 0.00000 | Gd | TA | PConc | Gd | TA | No | ALQ | 263.00000 | Unf | 0.00000 | 1017.00000 | 1280.00000 | GasA | Ex | Y | SBrkr | 1280 | 0 | 0 | 1280 | 0.00000 | 0.00000 | 2 | 0 | 2 | 1 | Gd | 5 | Typ | 0 | NaN | Attchd | 1992.00000 | RFn | 2.00000 | 506.00000 | TA | TA | Y | 0 | 82 | 0 | 0 | 144 | 0 | NaN | NaN | NaN | 0 | 1 | 2010 | WD | Normal | 191500 |
| 8 | 9 | 527146030 | 120 | RL | 39.00000 | 5389 | Pave | NaN | IR1 | Lvl | AllPub | Inside | Gtl | StoneBr | Norm | Norm | TwnhsE | 1Story | 8 | 5 | 1995 | 1996 | Gable | CompShg | CemntBd | CmentBd | None | 0.00000 | Gd | TA | PConc | Gd | TA | No | GLQ | 1180.00000 | Unf | 0.00000 | 415.00000 | 1595.00000 | GasA | Ex | Y | SBrkr | 1616 | 0 | 0 | 1616 | 1.00000 | 0.00000 | 2 | 0 | 2 | 1 | Gd | 5 | Typ | 1 | TA | Attchd | 1995.00000 | RFn | 2.00000 | 608.00000 | TA | TA | Y | 237 | 152 | 0 | 0 | 0 | 0 | NaN | NaN | NaN | 0 | 3 | 2010 | WD | Normal | 236500 |
| 9 | 10 | 527162130 | 60 | RL | 60.00000 | 7500 | Pave | NaN | Reg | Lvl | AllPub | Inside | Gtl | Gilbert | Norm | Norm | 1Fam | 2Story | 7 | 5 | 1999 | 1999 | Gable | CompShg | VinylSd | VinylSd | None | 0.00000 | TA | TA | PConc | TA | TA | No | Unf | 0.00000 | Unf | 0.00000 | 994.00000 | 994.00000 | GasA | Gd | Y | SBrkr | 1028 | 776 | 0 | 1804 | 0.00000 | 0.00000 | 2 | 1 | 3 | 1 | Gd | 7 | Typ | 1 | TA | Attchd | 1999.00000 | Fin | 2.00000 | 442.00000 | TA | TA | Y | 140 | 60 | 0 | 0 | 0 | 0 | NaN | NaN | NaN | 0 | 6 | 2010 | WD | Normal | 189000 |
amesH_data.shape
(2930, 82)
amesH_data.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 2930 entries, 0 to 2929 Data columns (total 82 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Order 2930 non-null int64 1 PID 2930 non-null int64 2 MS SubClass 2930 non-null int64 3 MS Zoning 2930 non-null object 4 Lot Frontage 2440 non-null float64 5 Lot Area 2930 non-null int64 6 Street 2930 non-null object 7 Alley 198 non-null object 8 Lot Shape 2930 non-null object 9 Land Contour 2930 non-null object 10 Utilities 2930 non-null object 11 Lot Config 2930 non-null object 12 Land Slope 2930 non-null object 13 Neighborhood 2930 non-null object 14 Condition 1 2930 non-null object 15 Condition 2 2930 non-null object 16 Bldg Type 2930 non-null object 17 House Style 2930 non-null object 18 Overall Qual 2930 non-null int64 19 Overall Cond 2930 non-null int64 20 Year Built 2930 non-null int64 21 Year Remod/Add 2930 non-null int64 22 Roof Style 2930 non-null object 23 Roof Matl 2930 non-null object 24 Exterior 1st 2930 non-null object 25 Exterior 2nd 2930 non-null object 26 Mas Vnr Type 2907 non-null object 27 Mas Vnr Area 2907 non-null float64 28 Exter Qual 2930 non-null object 29 Exter Cond 2930 non-null object 30 Foundation 2930 non-null object 31 Bsmt Qual 2850 non-null object 32 Bsmt Cond 2850 non-null object 33 Bsmt Exposure 2847 non-null object 34 BsmtFin Type 1 2850 non-null object 35 BsmtFin SF 1 2929 non-null float64 36 BsmtFin Type 2 2849 non-null object 37 BsmtFin SF 2 2929 non-null float64 38 Bsmt Unf SF 2929 non-null float64 39 Total Bsmt SF 2929 non-null float64 40 Heating 2930 non-null object 41 Heating QC 2930 non-null object 42 Central Air 2930 non-null object 43 Electrical 2929 non-null object 44 1st Flr SF 2930 non-null int64 45 2nd Flr SF 2930 non-null int64 46 Low Qual Fin SF 2930 non-null int64 47 Gr Liv Area 2930 non-null int64 48 Bsmt Full Bath 2928 non-null float64 49 Bsmt Half Bath 2928 non-null float64 50 Full Bath 2930 non-null int64 51 Half Bath 2930 non-null int64 52 Bedroom AbvGr 2930 non-null int64 53 Kitchen AbvGr 2930 non-null int64 54 Kitchen Qual 2930 non-null object 55 TotRms AbvGrd 2930 non-null int64 56 Functional 2930 non-null object 57 Fireplaces 2930 non-null int64 58 Fireplace Qu 1508 non-null object 59 Garage Type 2773 non-null object 60 Garage Yr Blt 2771 non-null float64 61 Garage Finish 2771 non-null object 62 Garage Cars 2929 non-null float64 63 Garage Area 2929 non-null float64 64 Garage Qual 2771 non-null object 65 Garage Cond 2771 non-null object 66 Paved Drive 2930 non-null object 67 Wood Deck SF 2930 non-null int64 68 Open Porch SF 2930 non-null int64 69 Enclosed Porch 2930 non-null int64 70 3Ssn Porch 2930 non-null int64 71 Screen Porch 2930 non-null int64 72 Pool Area 2930 non-null int64 73 Pool QC 13 non-null object 74 Fence 572 non-null object 75 Misc Feature 106 non-null object 76 Misc Val 2930 non-null int64 77 Mo Sold 2930 non-null int64 78 Yr Sold 2930 non-null int64 79 Sale Type 2930 non-null object 80 Sale Condition 2930 non-null object 81 SalePrice 2930 non-null int64 dtypes: float64(11), int64(28), object(43) memory usage: 1.8+ MB
Checking for Duplicate Values
amesH_data[amesH_data.duplicated(keep='last')]
| Order | PID | MS SubClass | MS Zoning | Lot Frontage | Lot Area | Street | Alley | Lot Shape | Land Contour | Utilities | Lot Config | Land Slope | Neighborhood | Condition 1 | Condition 2 | Bldg Type | House Style | Overall Qual | Overall Cond | Year Built | Year Remod/Add | Roof Style | Roof Matl | Exterior 1st | Exterior 2nd | Mas Vnr Type | Mas Vnr Area | Exter Qual | Exter Cond | Foundation | Bsmt Qual | Bsmt Cond | Bsmt Exposure | BsmtFin Type 1 | BsmtFin SF 1 | BsmtFin Type 2 | BsmtFin SF 2 | Bsmt Unf SF | Total Bsmt SF | Heating | Heating QC | Central Air | Electrical | 1st Flr SF | 2nd Flr SF | Low Qual Fin SF | Gr Liv Area | Bsmt Full Bath | Bsmt Half Bath | Full Bath | Half Bath | Bedroom AbvGr | Kitchen AbvGr | Kitchen Qual | TotRms AbvGrd | Functional | Fireplaces | Fireplace Qu | Garage Type | Garage Yr Blt | Garage Finish | Garage Cars | Garage Area | Garage Qual | Garage Cond | Paved Drive | Wood Deck SF | Open Porch SF | Enclosed Porch | 3Ssn Porch | Screen Porch | Pool Area | Pool QC | Fence | Misc Feature | Misc Val | Mo Sold | Yr Sold | Sale Type | Sale Condition | SalePrice |
|---|
Setting the target variable
target = amesH_data['SalePrice']
Basic Summary Statistics for Target
T_Summary = amesH_data['SalePrice'].describe()
T_mode = amesH_data['SalePrice'].mode()
T_Summary.loc['mode'] = T_mode.iloc[0]
T_Summary.loc['skew']=amesH_data['SalePrice'].skew()
T_Summary.loc['kurt'] = amesH_data['SalePrice'].kurt()
T_Summary.loc['Na'] = amesH_data['SalePrice'].isna().sum()
T_Summary.loc['dtype']= amesH_data['SalePrice'].dtypes
T_Summary
count 2930.00000 mean 180796.06007 std 79886.69236 min 12789.00000 25% 129500.00000 50% 160000.00000 75% 213500.00000 max 755000.00000 mode 135000.00000 skew 1.74350 kurt 5.11890 Na 0.00000 dtype int64 Name: SalePrice, dtype: object
Creating indexes for discrete, continuous, ordinal, and nominal features while displaying the number of each type of feature.
discFeat = ['Year Built', 'Year Remod/Add', 'Bsmt Full Bath', 'Bsmt Half Bath', 'Full Bath', 'Half Bath', 'Bedroom AbvGr', 'Kitchen AbvGr', 'TotRms AbvGrd', 'Fireplaces','Garage Yr Blt', 'Garage Cars', 'Yr Sold']
len(discFeat)
13
contFeat = ['Lot Frontage', 'Lot Area', 'Mas Vnr Area', 'BsmtFin SF 1', 'BsmtFin SF 2', 'Bsmt Unf SF', 'Total Bsmt SF', '1st Flr SF', '2nd Flr SF', 'Low Qual Fin SF', 'Gr Liv Area', 'Garage Area', 'Wood Deck SF', 'Open Porch SF', 'Enclosed Porch', '3Ssn Porch', 'Screen Porch', 'Pool Area', 'Misc Val']
len(contFeat)
19
ordF = ['Street', 'Alley','Lot Shape', 'Utilities', 'Land Slope', 'Overall Qual', 'Overall Cond', 'Exter Qual', 'Exter Cond', 'Bsmt Qual', 'Bsmt Cond', 'Bsmt Exposure', 'BsmtFin Type 1', 'BsmtFin Type 2', 'Heating QC', 'Electrical', 'Kitchen Qual', 'Functional', 'Fireplace Qu', 'Garage Finish', 'Garage Qual', 'Garage Cond', 'Paved Drive', 'Pool QC', 'Fence']
len(ordF)
25
nomF = ['MS SubClass', 'MS Zoning', 'Land Contour', 'Lot Config', 'Neighborhood', 'Condition 1', 'Condition 2', 'Bldg Type', 'House Style', 'Roof Style', 'Roof Matl', 'Exterior 1st', 'Exterior 2nd', 'Mas Vnr Type', 'Foundation', 'Heating', 'Central Air', 'Garage Type', 'Misc Feature', 'Mo Sold', 'Sale Type', 'Sale Condition']
len(nomF)
22
discFeat_Summary = amesH_data[discFeat].describe()
d_mode = amesH_data[discFeat].mode()
discFeat_Summary.loc['mode'] = d_mode.iloc[0]
discFeat_Summary.loc['skew']=amesH_data[discFeat].skew()
discFeat_Summary.loc['kurt'] =amesH_data[discFeat].kurt()
discFeat_Summary.loc['Na'] = amesH_data[discFeat].isna().sum()
discFeat_Summary.loc['dtype'] = amesH_data[discFeat].dtypes
discFeat_Summary
| Year Built | Year Remod/Add | Bsmt Full Bath | Bsmt Half Bath | Full Bath | Half Bath | Bedroom AbvGr | Kitchen AbvGr | TotRms AbvGrd | Fireplaces | Garage Yr Blt | Garage Cars | Yr Sold | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| count | 2930.00000 | 2930.00000 | 2928.00000 | 2928.00000 | 2930.00000 | 2930.00000 | 2930.00000 | 2930.00000 | 2930.00000 | 2930.00000 | 2771.00000 | 2929.00000 | 2930.00000 |
| mean | 1971.35631 | 1984.26655 | 0.43135 | 0.06113 | 1.56655 | 0.37952 | 2.85427 | 1.04437 | 6.44300 | 0.59932 | 1978.13244 | 1.76681 | 2007.79044 |
| std | 30.24536 | 20.86029 | 0.52482 | 0.24525 | 0.55294 | 0.50263 | 0.82773 | 0.21408 | 1.57296 | 0.64792 | 25.52841 | 0.76057 | 1.31661 |
| min | 1872.00000 | 1950.00000 | 0.00000 | 0.00000 | 0.00000 | 0.00000 | 0.00000 | 0.00000 | 2.00000 | 0.00000 | 1895.00000 | 0.00000 | 2006.00000 |
| 25% | 1954.00000 | 1965.00000 | 0.00000 | 0.00000 | 1.00000 | 0.00000 | 2.00000 | 1.00000 | 5.00000 | 0.00000 | 1960.00000 | 1.00000 | 2007.00000 |
| 50% | 1973.00000 | 1993.00000 | 0.00000 | 0.00000 | 2.00000 | 0.00000 | 3.00000 | 1.00000 | 6.00000 | 1.00000 | 1979.00000 | 2.00000 | 2008.00000 |
| 75% | 2001.00000 | 2004.00000 | 1.00000 | 0.00000 | 2.00000 | 1.00000 | 3.00000 | 1.00000 | 7.00000 | 1.00000 | 2002.00000 | 2.00000 | 2009.00000 |
| max | 2010.00000 | 2010.00000 | 3.00000 | 2.00000 | 4.00000 | 2.00000 | 8.00000 | 3.00000 | 15.00000 | 4.00000 | 2207.00000 | 5.00000 | 2010.00000 |
| mode | 2005.00000 | 1950.00000 | 0.00000 | 0.00000 | 2.00000 | 0.00000 | 3.00000 | 1.00000 | 6.00000 | 0.00000 | 2005.00000 | 2.00000 | 2007.00000 |
| skew | -0.60446 | -0.45186 | 0.61664 | 3.94080 | 0.17195 | 0.69771 | 0.30569 | 4.31382 | 0.75354 | 0.73922 | -0.38467 | -0.21984 | 0.13486 |
| kurt | -0.50172 | -1.34152 | -0.74799 | 14.92174 | -0.54144 | -1.03034 | 1.89142 | 19.86974 | 1.15459 | 0.10151 | 1.82658 | 0.24497 | -1.15758 |
| Na | 0.00000 | 0.00000 | 2.00000 | 2.00000 | 0.00000 | 0.00000 | 0.00000 | 0.00000 | 0.00000 | 0.00000 | 159.00000 | 1.00000 | 0.00000 |
| dtype | int64 | int64 | float64 | float64 | int64 | int64 | int64 | int64 | int64 | int64 | float64 | float64 | int64 |
A garage built in 2207? Either there is some time travelling going on or this is a mistake.
amesH_data[amesH_data['Garage Yr Blt'] > 2011].loc[:, 'Garage Yr Blt':]
| Garage Yr Blt | Garage Finish | Garage Cars | Garage Area | Garage Qual | Garage Cond | Paved Drive | Wood Deck SF | Open Porch SF | Enclosed Porch | 3Ssn Porch | Screen Porch | Pool Area | Pool QC | Fence | Misc Feature | Misc Val | Mo Sold | Yr Sold | Sale Type | Sale Condition | SalePrice | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 2260 | 2207.00000 | RFn | 2.00000 | 502.00000 | TA | TA | Y | 132 | 0 | 0 | 0 | 0 | 0 | NaN | NaN | NaN | 0 | 9 | 2007 | New | Partial | 267300 |
In the case of categories like Basement Full Bath and Garage Cars, we would expect that a NaN value indicates the absence of that particular feature. As such, we fill the missing values with 0.
amesH_data.loc[:, ['Bsmt Full Bath', 'Bsmt Half Bath', 'Garage Yr Blt', 'Garage Cars']] = amesH_data.loc[:, ['Bsmt Full Bath', 'Bsmt Half Bath', 'Garage Yr Blt', 'Garage Cars']].fillna(0)
The one exception is for Garage Year Built where a zero value wouldn't make sense. Though it probably isn't a perfect solution I decided to simply fill Na values with Year Built.
amesH_data.loc[amesH_data['Garage Yr Blt'] == 0, 'Garage Yr Blt'] = amesH_data['Year Built']
contFeat_Summary = amesH_data[contFeat].describe()
c_mode = amesH_data[contFeat].mode()
contFeat_Summary.loc['mode'] = c_mode.iloc[0]
contFeat_Summary.loc['skew'] = amesH_data[contFeat].skew()
contFeat_Summary.loc['kurt'] = amesH_data[contFeat].skew()
contFeat_Summary.loc['Na'] = amesH_data[contFeat].isna().sum()
contFeat_Summary.loc['dtype'] = amesH_data[contFeat].dtypes
contFeat_Summary
| Lot Frontage | Lot Area | Mas Vnr Area | BsmtFin SF 1 | BsmtFin SF 2 | Bsmt Unf SF | Total Bsmt SF | 1st Flr SF | 2nd Flr SF | Low Qual Fin SF | Gr Liv Area | Garage Area | Wood Deck SF | Open Porch SF | Enclosed Porch | 3Ssn Porch | Screen Porch | Pool Area | Misc Val | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| count | 2440.00000 | 2930.00000 | 2907.00000 | 2929.00000 | 2929.00000 | 2929.00000 | 2929.00000 | 2930.00000 | 2930.00000 | 2930.00000 | 2930.00000 | 2929.00000 | 2930.00000 | 2930.00000 | 2930.00000 | 2930.00000 | 2930.00000 | 2930.00000 | 2930.00000 |
| mean | 69.22459 | 10147.92184 | 101.89680 | 442.62957 | 49.72243 | 559.26255 | 1051.61454 | 1159.55768 | 335.45597 | 4.67679 | 1499.69044 | 472.81973 | 93.75188 | 47.53345 | 23.01160 | 2.59249 | 16.00205 | 2.24334 | 50.63515 |
| std | 23.36533 | 7880.01776 | 179.11261 | 455.59084 | 169.16848 | 439.49415 | 440.61507 | 391.89089 | 428.39572 | 46.31051 | 505.50889 | 215.04655 | 126.36156 | 67.48340 | 64.13906 | 25.14133 | 56.08737 | 35.59718 | 566.34429 |
| min | 21.00000 | 1300.00000 | 0.00000 | 0.00000 | 0.00000 | 0.00000 | 0.00000 | 334.00000 | 0.00000 | 0.00000 | 334.00000 | 0.00000 | 0.00000 | 0.00000 | 0.00000 | 0.00000 | 0.00000 | 0.00000 | 0.00000 |
| 25% | 58.00000 | 7440.25000 | 0.00000 | 0.00000 | 0.00000 | 219.00000 | 793.00000 | 876.25000 | 0.00000 | 0.00000 | 1126.00000 | 320.00000 | 0.00000 | 0.00000 | 0.00000 | 0.00000 | 0.00000 | 0.00000 | 0.00000 |
| 50% | 68.00000 | 9436.50000 | 0.00000 | 370.00000 | 0.00000 | 466.00000 | 990.00000 | 1084.00000 | 0.00000 | 0.00000 | 1442.00000 | 480.00000 | 0.00000 | 27.00000 | 0.00000 | 0.00000 | 0.00000 | 0.00000 | 0.00000 |
| 75% | 80.00000 | 11555.25000 | 164.00000 | 734.00000 | 0.00000 | 802.00000 | 1302.00000 | 1384.00000 | 703.75000 | 0.00000 | 1742.75000 | 576.00000 | 168.00000 | 70.00000 | 0.00000 | 0.00000 | 0.00000 | 0.00000 | 0.00000 |
| max | 313.00000 | 215245.00000 | 1600.00000 | 5644.00000 | 1526.00000 | 2336.00000 | 6110.00000 | 5095.00000 | 2065.00000 | 1064.00000 | 5642.00000 | 1488.00000 | 1424.00000 | 742.00000 | 1012.00000 | 508.00000 | 576.00000 | 800.00000 | 17000.00000 |
| mode | 60.00000 | 9600.00000 | 0.00000 | 0.00000 | 0.00000 | 0.00000 | 0.00000 | 864.00000 | 0.00000 | 0.00000 | 864.00000 | 0.00000 | 0.00000 | 0.00000 | 0.00000 | 0.00000 | 0.00000 | 0.00000 | 0.00000 |
| skew | 1.49907 | 12.82090 | 2.60698 | 1.41618 | 4.13998 | 0.92305 | 1.15620 | 1.46943 | 0.86646 | 12.11816 | 1.27411 | 0.24199 | 1.84268 | 2.53539 | 4.01445 | 11.40379 | 3.95747 | 16.93914 | 21.99979 |
| kurt | 1.49907 | 12.82090 | 2.60698 | 1.41618 | 4.13998 | 0.92305 | 1.15620 | 1.46943 | 0.86646 | 12.11816 | 1.27411 | 0.24199 | 1.84268 | 2.53539 | 4.01445 | 11.40379 | 3.95747 | 16.93914 | 21.99979 |
| Na | 490.00000 | 0.00000 | 23.00000 | 1.00000 | 1.00000 | 1.00000 | 1.00000 | 0.00000 | 0.00000 | 0.00000 | 0.00000 | 1.00000 | 0.00000 | 0.00000 | 0.00000 | 0.00000 | 0.00000 | 0.00000 | 0.00000 |
| dtype | float64 | int64 | float64 | float64 | float64 | float64 | float64 | int64 | int64 | int64 | int64 | float64 | int64 | int64 | int64 | int64 | int64 | int64 | int64 |
Similar to discrete variables, NaN values for all of the continuous features here likely indicates an abscence of that particular feature. For example, Na values for Total Bsmt Square Footage or Masonry Veneer indicates that the house does not have any basement or veneer. Again, I fill these na values with 0.
len(amesH_data[amesH_data['Lot Frontage'].isna()])
490
amesH_data.loc[:, ['Mas Vnr Area', 'BsmtFin SF 1', 'BsmtFin SF 2', 'Bsmt Unf SF', 'Total Bsmt SF', 'Garage Area']] = amesH_data.loc[:, ['Mas Vnr Area', 'BsmtFin SF 1', 'BsmtFin SF 2', 'Bsmt Unf SF', 'Total Bsmt SF', 'Garage Area']].fillna(0)
There is something curious though about the large number of Na values for Lot Frontage. This seemed strange to me so I checked a random number of records with NaN values for Lot Frontage using the assessor records that are available via the City of Ames website.
frontageDisc = amesH_data[amesH_data['Lot Frontage'].isna()]
len(frontageDisc)
490
checkFrontage = frontageDisc.sample(100, random_state=0)
# for x in checkFrontage.iloc[:,1]:
# webbrowser.open('https://beacon.schneidercorp.com/Application.aspx?AppID=165&LayerID=2145&PageTypeID=4&PageID=1108&Q=1381657885&KeyValue=0' + str(x))
After checking the assessor records(overhead photos), all of the houses in my sample clearly have lot frontage on a street. Some have irregular shapes such as crescent lots but most of these still have some square footage numbers for frontage. In other respects, the assessor records seem to match those in the dataset. This seems to be an error in data collection.
To resolve this in the present context, I will later use a KNN imputer for lot frontage to fill formerly Na values.
I create a quick function that displays all the value counts for a group of features.
def valCount(x):
for col in amesH_data[x].columns:
count = amesH_data[col].value_counts(dropna=False)
print(col, ' \n', count)
print('===================================')
valCount(ordF)
Street Pave 2918 Grvl 12 Name: Street, dtype: int64 =================================== Alley NaN 2732 Grvl 120 Pave 78 Name: Alley, dtype: int64 =================================== Lot Shape Reg 1859 IR1 979 IR2 76 IR3 16 Name: Lot Shape, dtype: int64 =================================== Utilities AllPub 2927 NoSewr 2 NoSeWa 1 Name: Utilities, dtype: int64 =================================== Land Slope Gtl 2789 Mod 125 Sev 16 Name: Land Slope, dtype: int64 =================================== Overall Qual 5 825 6 732 7 602 8 350 4 226 9 107 3 40 10 31 2 13 1 4 Name: Overall Qual, dtype: int64 =================================== Overall Cond 5 1654 6 533 7 390 8 144 4 101 3 50 9 41 2 10 1 7 Name: Overall Cond, dtype: int64 =================================== Exter Qual TA 1799 Gd 989 Ex 107 Fa 35 Name: Exter Qual, dtype: int64 =================================== Exter Cond TA 2549 Gd 299 Fa 67 Ex 12 Po 3 Name: Exter Cond, dtype: int64 =================================== Bsmt Qual TA 1283 Gd 1219 Ex 258 Fa 88 NaN 80 Po 2 Name: Bsmt Qual, dtype: int64 =================================== Bsmt Cond TA 2616 Gd 122 Fa 104 NaN 80 Po 5 Ex 3 Name: Bsmt Cond, dtype: int64 =================================== Bsmt Exposure No 1906 Av 418 Gd 284 Mn 239 NaN 83 Name: Bsmt Exposure, dtype: int64 =================================== BsmtFin Type 1 GLQ 859 Unf 851 ALQ 429 Rec 288 BLQ 269 LwQ 154 NaN 80 Name: BsmtFin Type 1, dtype: int64 =================================== BsmtFin Type 2 Unf 2499 Rec 106 LwQ 89 NaN 81 BLQ 68 ALQ 53 GLQ 34 Name: BsmtFin Type 2, dtype: int64 =================================== Heating QC Ex 1495 TA 864 Gd 476 Fa 92 Po 3 Name: Heating QC, dtype: int64 =================================== Electrical SBrkr 2682 FuseA 188 FuseF 50 FuseP 8 NaN 1 Mix 1 Name: Electrical, dtype: int64 =================================== Kitchen Qual TA 1494 Gd 1160 Ex 205 Fa 70 Po 1 Name: Kitchen Qual, dtype: int64 =================================== Functional Typ 2728 Min2 70 Min1 65 Mod 35 Maj1 19 Maj2 9 Sev 2 Sal 2 Name: Functional, dtype: int64 =================================== Fireplace Qu NaN 1422 Gd 744 TA 600 Fa 75 Po 46 Ex 43 Name: Fireplace Qu, dtype: int64 =================================== Garage Finish Unf 1231 RFn 812 Fin 728 NaN 159 Name: Garage Finish, dtype: int64 =================================== Garage Qual TA 2615 NaN 159 Fa 124 Gd 24 Po 5 Ex 3 Name: Garage Qual, dtype: int64 =================================== Garage Cond TA 2665 NaN 159 Fa 74 Gd 15 Po 14 Ex 3 Name: Garage Cond, dtype: int64 =================================== Paved Drive Y 2652 N 216 P 62 Name: Paved Drive, dtype: int64 =================================== Pool QC NaN 2917 Ex 4 Gd 4 TA 3 Fa 2 Name: Pool QC, dtype: int64 =================================== Fence NaN 2358 MnPrv 330 GdPrv 118 GdWo 112 MnWw 12 Name: Fence, dtype: int64 ===================================
For most of the ordinal features an NaN value indicates the absence of that feature so I fill the missing values with 'None'.
amesH_data.loc[:,['Bsmt Qual', 'Bsmt Cond', 'Bsmt Exposure', 'BsmtFin Type 1', 'BsmtFin Type 2', 'Fireplace Qu','Garage Finish', 'Garage Qual', 'Garage Cond', 'Pool QC', 'Fence']] = amesH_data.loc[:,['Bsmt Qual', 'Bsmt Cond', 'Bsmt Exposure', 'BsmtFin Type 1', 'BsmtFin Type 2', 'Fireplace Qu','Garage Finish', 'Garage Qual', 'Garage Cond', 'Pool QC', 'Fence']].fillna('None')
There is one exception. One single record has a NaN value for Electrical. It is very unlikely that this means that the house has no electrical system. It is necessary then to check the record in question.
noElectricDisc = amesH_data[amesH_data['Electrical'].isna() == True]
noElectricDisc
| Order | PID | MS SubClass | MS Zoning | Lot Frontage | Lot Area | Street | Alley | Lot Shape | Land Contour | Utilities | Lot Config | Land Slope | Neighborhood | Condition 1 | Condition 2 | Bldg Type | House Style | Overall Qual | Overall Cond | Year Built | Year Remod/Add | Roof Style | Roof Matl | Exterior 1st | Exterior 2nd | Mas Vnr Type | Mas Vnr Area | Exter Qual | Exter Cond | Foundation | Bsmt Qual | Bsmt Cond | Bsmt Exposure | BsmtFin Type 1 | BsmtFin SF 1 | BsmtFin Type 2 | BsmtFin SF 2 | Bsmt Unf SF | Total Bsmt SF | Heating | Heating QC | Central Air | Electrical | 1st Flr SF | 2nd Flr SF | Low Qual Fin SF | Gr Liv Area | Bsmt Full Bath | Bsmt Half Bath | Full Bath | Half Bath | Bedroom AbvGr | Kitchen AbvGr | Kitchen Qual | TotRms AbvGrd | Functional | Fireplaces | Fireplace Qu | Garage Type | Garage Yr Blt | Garage Finish | Garage Cars | Garage Area | Garage Qual | Garage Cond | Paved Drive | Wood Deck SF | Open Porch SF | Enclosed Porch | 3Ssn Porch | Screen Porch | Pool Area | Pool QC | Fence | Misc Feature | Misc Val | Mo Sold | Yr Sold | Sale Type | Sale Condition | SalePrice | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 1577 | 1578 | 916386080 | 80 | RL | 73.00000 | 9735 | Pave | NaN | Reg | Lvl | AllPub | Inside | Gtl | Timber | Norm | Norm | 1Fam | SLvl | 5 | 5 | 2006 | 2007 | Gable | CompShg | VinylSd | VinylSd | None | 0.00000 | TA | TA | PConc | Gd | TA | No | Unf | 0.00000 | Unf | 0.00000 | 384.00000 | 384.00000 | GasA | Gd | Y | NaN | 754 | 640 | 0 | 1394 | 0.00000 | 0.00000 | 2 | 1 | 3 | 1 | Gd | 7 | Typ | 0 | None | BuiltIn | 2007.00000 | Fin | 2.00000 | 400.00000 | TA | TA | Y | 100 | 0 | 0 | 0 | 0 | 0 | None | None | NaN | 0 | 5 | 2008 | WD | Normal | 167500 |
Given that this particular house was built in 2006, it is almost certain that the house would have a standard breaker box. Just to check this though I will take a quick look at electrical system by year built.
amesH_data.groupby(['Year Built', 'Electrical'])['Electrical'].count().tail(50)
#amesH_data.groupby('Yr Sold')['SalePrice'].agg(np.median)
Year Built Electrical
1962 SBrkr 35
1963 SBrkr 35
1964 SBrkr 33
1965 FuseF 1
SBrkr 33
1966 SBrkr 35
1967 SBrkr 41
1968 SBrkr 45
1969 SBrkr 28
1970 SBrkr 42
1971 SBrkr 39
1972 SBrkr 40
1973 SBrkr 21
1974 SBrkr 23
1975 SBrkr 25
1976 SBrkr 54
1977 SBrkr 57
1978 SBrkr 42
1979 SBrkr 21
1980 SBrkr 27
1981 SBrkr 10
1982 SBrkr 7
1983 SBrkr 8
1984 SBrkr 19
1985 SBrkr 7
1986 SBrkr 11
1987 SBrkr 8
1988 SBrkr 15
1989 SBrkr 8
1990 SBrkr 19
1991 SBrkr 12
1992 SBrkr 27
1993 SBrkr 40
1994 SBrkr 37
1995 SBrkr 31
1996 SBrkr 34
1997 SBrkr 35
1998 SBrkr 47
1999 SBrkr 52
2000 SBrkr 48
2001 SBrkr 35
2002 SBrkr 47
2003 SBrkr 88
2004 SBrkr 99
2005 SBrkr 142
2006 SBrkr 137
2007 SBrkr 109
2008 SBrkr 49
2009 SBrkr 25
2010 SBrkr 3
Name: Electrical, dtype: int64
As I thought, the most recent house built without a standard breaker was built in 1965. It is safe then to assume that our mystery house has a standard breaker.
amesH_data.loc[:,'Electrical']= amesH_data.loc[:,'Electrical'].fillna('SBrkr')
valCount(nomF)
MS SubClass 20 1079 60 575 50 287 120 192 30 139 160 129 70 128 80 118 90 109 190 61 85 48 75 23 45 18 180 17 40 6 150 1 Name: MS SubClass, dtype: int64 =================================== MS Zoning RL 2273 RM 462 FV 139 RH 27 C (all) 25 I (all) 2 A (agr) 2 Name: MS Zoning, dtype: int64 =================================== Land Contour Lvl 2633 HLS 120 Bnk 117 Low 60 Name: Land Contour, dtype: int64 =================================== Lot Config Inside 2140 Corner 511 CulDSac 180 FR2 85 FR3 14 Name: Lot Config, dtype: int64 =================================== Neighborhood NAmes 443 CollgCr 267 OldTown 239 Edwards 194 Somerst 182 NridgHt 166 Gilbert 165 Sawyer 151 NWAmes 131 SawyerW 125 Mitchel 114 BrkSide 108 Crawfor 103 IDOTRR 93 Timber 72 NoRidge 71 StoneBr 51 SWISU 48 ClearCr 44 MeadowV 37 BrDale 30 Blmngtn 28 Veenker 24 NPkVill 23 Blueste 10 Greens 8 GrnHill 2 Landmrk 1 Name: Neighborhood, dtype: int64 =================================== Condition 1 Norm 2522 Feedr 164 Artery 92 RRAn 50 PosN 39 RRAe 28 PosA 20 RRNn 9 RRNe 6 Name: Condition 1, dtype: int64 =================================== Condition 2 Norm 2900 Feedr 13 Artery 5 PosA 4 PosN 4 RRNn 2 RRAe 1 RRAn 1 Name: Condition 2, dtype: int64 =================================== Bldg Type 1Fam 2425 TwnhsE 233 Duplex 109 Twnhs 101 2fmCon 62 Name: Bldg Type, dtype: int64 =================================== House Style 1Story 1481 2Story 873 1.5Fin 314 SLvl 128 SFoyer 83 2.5Unf 24 1.5Unf 19 2.5Fin 8 Name: House Style, dtype: int64 =================================== Roof Style Gable 2321 Hip 551 Gambrel 22 Flat 20 Mansard 11 Shed 5 Name: Roof Style, dtype: int64 =================================== Roof Matl CompShg 2887 Tar&Grv 23 WdShake 9 WdShngl 7 Membran 1 ClyTile 1 Roll 1 Metal 1 Name: Roof Matl, dtype: int64 =================================== Exterior 1st VinylSd 1026 MetalSd 450 HdBoard 442 Wd Sdng 420 Plywood 221 CemntBd 126 BrkFace 88 WdShing 56 AsbShng 44 Stucco 43 BrkComm 6 AsphShn 2 CBlock 2 Stone 2 PreCast 1 ImStucc 1 Name: Exterior 1st, dtype: int64 =================================== Exterior 2nd VinylSd 1015 MetalSd 447 HdBoard 406 Wd Sdng 397 Plywood 274 CmentBd 126 Wd Shng 81 Stucco 47 BrkFace 47 AsbShng 38 Brk Cmn 22 ImStucc 15 Stone 6 AsphShn 4 CBlock 3 PreCast 1 Other 1 Name: Exterior 2nd, dtype: int64 =================================== Mas Vnr Type None 1752 BrkFace 880 Stone 249 BrkCmn 25 NaN 23 CBlock 1 Name: Mas Vnr Type, dtype: int64 =================================== Foundation PConc 1310 CBlock 1244 BrkTil 311 Slab 49 Stone 11 Wood 5 Name: Foundation, dtype: int64 =================================== Heating GasA 2885 GasW 27 Grav 9 Wall 6 OthW 2 Floor 1 Name: Heating, dtype: int64 =================================== Central Air Y 2734 N 196 Name: Central Air, dtype: int64 =================================== Garage Type Attchd 1731 Detchd 782 BuiltIn 186 NaN 157 Basment 36 2Types 23 CarPort 15 Name: Garage Type, dtype: int64 =================================== Misc Feature NaN 2824 Shed 95 Gar2 5 Othr 4 Elev 1 TenC 1 Name: Misc Feature, dtype: int64 =================================== Mo Sold 6 505 7 449 5 395 4 279 8 233 3 232 10 173 9 161 11 143 2 133 1 123 12 104 Name: Mo Sold, dtype: int64 =================================== Sale Type WD 2536 New 239 COD 87 ConLD 26 CWD 12 ConLI 9 ConLw 8 Oth 7 Con 5 VWD 1 Name: Sale Type, dtype: int64 =================================== Sale Condition Normal 2413 Partial 245 Abnorml 190 Family 46 Alloca 24 AdjLand 12 Name: Sale Condition, dtype: int64 ===================================
I fill NaN nominal values with 'None' as it is likely that these represent an absence of a feature such as a garage or alley access.
amesH_data.loc[:, ['Alley', 'Mas Vnr Type', 'Garage Type', 'Misc Feature']] = amesH_data.loc[:, ['Alley', 'Mas Vnr Type', 'Garage Type', 'Misc Feature']].fillna('None')
Again I came across a discrepancy. There are several houses that have 'None' for Mas Vnr Type yet there is a non-zero value for Mas Vnr Area.
inv_MSVNR = amesH_data[(amesH_data['Mas Vnr Type'] == 'None') & (amesH_data['Mas Vnr Area'] > 0)]
inv_MSVNR.loc[:,['Order', 'PID', 'Bldg Type', 'Year Built', 'Exterior 1st', 'Exterior 2nd', 'Mas Vnr Type', 'Mas Vnr Area']]
| Order | PID | Bldg Type | Year Built | Exterior 1st | Exterior 2nd | Mas Vnr Type | Mas Vnr Area | |
|---|---|---|---|---|---|---|---|---|
| 363 | 364 | 527166010 | 1Fam | 1999 | VinylSd | VinylSd | None | 344.00000 |
| 403 | 404 | 527451110 | TwnhsE | 1970 | HdBoard | HdBoard | None | 312.00000 |
| 441 | 442 | 528138010 | 1Fam | 2008 | VinylSd | VinylSd | None | 285.00000 |
| 1861 | 1862 | 533352075 | Duplex | 1977 | Plywood | Plywood | None | 1.00000 |
| 1913 | 1914 | 535106140 | 1Fam | 1958 | Wd Sdng | Wd Sdng | None | 1.00000 |
| 2003 | 2004 | 902427140 | 1Fam | 1956 | MetalSd | MetalSd | None | 1.00000 |
| 2528 | 2529 | 534129230 | 1Fam | 1972 | VinylSd | VinylSd | None | 288.00000 |
len(inv_MSVNR)
7
Since there are only 5 records, I check the Ames City Assessor records again and examine photos of each of the houses and then fill im the appropriate Mass Vnr Type or alternatively set the Mas Vnr Area to 0.
# Note I commented out the code below that accesses the relevant Ames city assessor records
# for x in inv_MSVNR.iloc[:,1]:
# webbrowser.open('https://beacon.schneidercorp.com/Application.aspx?AppID=165&LayerID=2145&PageTypeID=4&PageID=1108&Q=1381657885&KeyValue=0' + str(x))
amesH_data.iloc[363,26] = 'BrkFace'
amesH_data.iloc[441,26] = 'BrkFace'
amesH_data.iloc[2528,26] = 'BrkFace'
amesH_data.iloc[403,27] = 0
amesH_data.iloc[1861,27] = 0
amesH_data.iloc[1913,27] = 0
amesH_data.iloc[2003,27] = 0
Note:Now Taken care of @ Discrete Features section - all Garage Yr Built Na filled with Year Built There was one other discrepency that I came across. There is one record which has a Garage Type value and a non-zero Garage Area value but 0 for Garage Yr Blt.
# Now Taken care of @ Discrete Features section - all Garage Yr Built Na filled with Year Built
# amesH_data[(amesH_data['Garage Yr Blt'] == 0) & (amesH_data['Garage Area']!=0)]
amesH_data.drop(['Order', 'PID'], axis=1, inplace=True)
amesH_data = amesH_data.replace({'Street':{'Grvl':1, 'Pave':2},
'Alley':{'None':0, 'Grvl': 1, 'Pave': 2},
'Lot Shape':{'IR3':1, 'IR2':2, 'IR1':3, 'Reg':4},
'Utilities':{'ELO':1, 'NoSeWa':2, 'NoSewr':3, 'AllPub':4},
'Land Slope':{'Sev':1, 'Mod':2, 'Gtl':3},
'Exter Cond':{'Po':1, 'Fa':2, 'TA':3, 'Gd':4, 'Ex':5},
'Exter Qual':{'Po':1, 'Fa':2, 'TA':3, 'Gd':4, 'Ex':5},
'Bsmt Qual':{'None':0, 'Po':1, 'Fa': 2, 'TA':3, 'Gd':4, 'Ex':5},
'Bsmt Cond': {'None':0, 'Po': 1, 'Fa':2, 'TA':3, 'Gd':4, 'Ex':5},
'Bsmt Exposure': {'None': 0, 'No':0,'Mn':1, 'Av':2, 'Gd':3},
'BsmtFin Type 1':{'None':0, 'Unf':1, 'LwQ':2, 'Rec':3, 'BLQ':4, 'ALQ':5, 'GLQ':6},
'BsmtFin Type 2':{'None':0, 'Unf':1, 'LwQ':2, 'Rec':3, 'BLQ':4, 'ALQ':5, 'GLQ':6},
'Heating QC':{'None':0, 'Po':1, 'Fa':2, 'TA':3, 'Gd':4, 'Ex':5},
'Electrical':{'Mix':0, 'FuseP':1, 'FuseF':2,'FuseA':3,'SBrkr':4},
'Kitchen Qual':{'None':0, 'Po':1, 'Fa':2, 'TA':3, 'Gd':4, 'Ex':5},
'Functional':{'Sal':1, 'Sev':2, 'Maj2':3, 'Maj1':4, 'Mod':5, 'Min2':6, 'Min1':7, 'Typ':8},
'Fireplace Qu':{'None':0, 'Po':1, 'Fa':2, 'TA':3, 'Gd':4, 'Ex':5},
'Garage Finish':{'None':0, 'Unf':1, 'RFn':2, 'Fin':3},
'Garage Qual':{'None':0, 'Po':1, 'Fa':2, 'TA':3, 'Gd':4, 'Ex':5},
'Garage Cond':{'None': 0, 'Po':1, 'Fa':2, 'TA':3, 'Gd':4, 'Ex':5},
'Paved Drive':{'N':0, 'P':1, 'Y':2},
'Pool QC':{'None':0, 'Po':1, 'Fa':2, 'TA':3, 'Gd':4, 'Ex':5},
'Fence':{'None':0, 'MnWw':1, 'GdWo':2, 'MnPrv':3, 'GdPrv':4}
})
amesH_data = amesH_data.replace({'MS SubClass':{20: 'MSC20', 30:'MSC30', 40:'MSC40', 45: 'MSC45', 50:'MSC50', 60:'MSC60', 70:'MSC70', 75:'MSC75', 80:'MSC80', 85:'MSC85', 90:'MSC90', 120:'MSC120', 150:'MSC150', 160:'MSC160', 180:'MSC180', 190:'SC190'},
'Mo Sold':{1:'Jan', 2:'Feb', 3:'Mar', 4:'Apr', 5:'May', 6:'Jun', 7:'Jul', 8:'Aug', 9:'Sep', 10:'Oct', 11:'Nov', 12:'Dec'}
})
I originally thought that simplifying a number of the ordinal features so as to reduce levels with very few values would be a good idea. After doing some modelling, I decided to scrape this experiment.
# Create new features
# 1 * Simplification of exisitng features
# amesH_data['Overall Qual'].replace({
# 1:1,2:1,3:1, #bad
# 4:2, 5:2, 6:2, #average
# 7:3, 8:3, 9:3, 10:3 # good
# })
# amesH_data['Overall Cond'].replace({
# 1:1,2:1,3:1, #bad
# 4:2, 5:2, 6:2, #average
# 7:3, 8:3, 9:3, 10:3 # good
# })
# amesH_data['Pool QC'].replace({
# 1:1,2:1, #average
# 3:2, 4:2 #good
# })
# amesH_data['Garage Cond'].replace({
# 1:1, # bad
# 2:1, 3:1, # average
# 4:2, 5:2 # goods
# })
# amesH_data['Garage Qual'].replace({
# 1:1, #bad
# 2:1, 3:1, #average
# 4:2, 5:2 #good
# })
# amesH_data['Fireplace Qu'].replace({
# 1:1, # bad
# 2:1, 3:1, # average
# 4:2, 5:2
# })
# amesH_data['Functional'].replace({
# 1:1, 2:1, #bad
# 3:2, 4:2, # major
# 5:3, 6:3, 7:3, # minor
# 8:4 # typical
# })
# amesH_data['Kitchen Qual'].replace({
# 1:1, # bad
# 2:1, 3:1, # average
# 4:2, 5:2 # good
# })
# amesH_data['Heating QC'].replace({
# 1:1, # bad
# 2:1, 3:1, # average
# 4:2, 5:2 # good
# })
# amesH_data['BsmtFin Type 1'].replace({
# 1:1, # unfinished
# 2:1, 3:1, # rec room
# 4:2, 5:2, 6:2 # living quaters
# })
# amesH_data['BsmtFin Type 2'].replace({
# 1:1, # unfinished
# 2:1, 3:1, # rec room
# 4:2, 5:2, 6:2 # living quarters
# })
# amesH_data['Bsmt Cond'].replace({
# 1:1, #bad
# 2:1, 3:1, # average
# 4:2, 5:2 # good
# })
# amesH_data['Bsmt Qual'].replace({
# 1:1, # bad
# 2:1, 3:1, # average
# 4:2, 5:2 # good
# })
# amesH_data['Exter Cond'].replace({
# 1:1, # bad
# 2:1, 3:1, # average
# 4:2, 5:2
# })
# amesH_data['Exter Qual'].replace({
# 1:1, # bad
# 2:1, 3:1, # average
# 4:2, 5:2 # good
# })
These Na values are those for Lot Frontage they will be taken care of in the second data prep phase via the KNN imputer.
amesH_data.isna().values.sum()
490
plt.style.use('ggplot')
We can see here that there is a linear positive relationship between Greater Living Area and Sale Price.
fig, ax = plt.subplots(figsize=(11,7))
plt.scatter(amesH_data['Gr Liv Area'],amesH_data['SalePrice'], c='red', marker='.')
plt.title('Relationship Between Above Ground Living Area and Sale Price', fontsize=20)
plt.xlabel('Living Area Square Footage')
plt.ylabel('Sale Price (US Dollars)')
plt.show()
There are, however, some extreme outliers - specifically 3 houses with square footage over 4500.
amesH_data[(amesH_data['Gr Liv Area'] > 4500)]
| MS SubClass | MS Zoning | Lot Frontage | Lot Area | Street | Alley | Lot Shape | Land Contour | Utilities | Lot Config | Land Slope | Neighborhood | Condition 1 | Condition 2 | Bldg Type | House Style | Overall Qual | Overall Cond | Year Built | Year Remod/Add | Roof Style | Roof Matl | Exterior 1st | Exterior 2nd | Mas Vnr Type | Mas Vnr Area | Exter Qual | Exter Cond | Foundation | Bsmt Qual | Bsmt Cond | Bsmt Exposure | BsmtFin Type 1 | BsmtFin SF 1 | BsmtFin Type 2 | BsmtFin SF 2 | Bsmt Unf SF | Total Bsmt SF | Heating | Heating QC | Central Air | Electrical | 1st Flr SF | 2nd Flr SF | Low Qual Fin SF | Gr Liv Area | Bsmt Full Bath | Bsmt Half Bath | Full Bath | Half Bath | Bedroom AbvGr | Kitchen AbvGr | Kitchen Qual | TotRms AbvGrd | Functional | Fireplaces | Fireplace Qu | Garage Type | Garage Yr Blt | Garage Finish | Garage Cars | Garage Area | Garage Qual | Garage Cond | Paved Drive | Wood Deck SF | Open Porch SF | Enclosed Porch | 3Ssn Porch | Screen Porch | Pool Area | Pool QC | Fence | Misc Feature | Misc Val | Mo Sold | Yr Sold | Sale Type | Sale Condition | SalePrice | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 1498 | MSC60 | RL | 313.00000 | 63887 | 2 | 0 | 1 | Bnk | 4 | Corner | 3 | Edwards | Feedr | Norm | 1Fam | 2Story | 10 | 5 | 2008 | 2008 | Hip | ClyTile | Stucco | Stucco | Stone | 796.00000 | 5 | 3 | PConc | 5 | 3 | 3 | 6 | 5644.00000 | 1 | 0.00000 | 466.00000 | 6110.00000 | GasA | 5 | Y | 4 | 4692 | 950 | 0 | 5642 | 2.00000 | 0.00000 | 2 | 1 | 3 | 1 | 5 | 12 | 8 | 3 | 4 | Attchd | 2008.00000 | 3 | 2.00000 | 1418.00000 | 3 | 3 | 2 | 214 | 292 | 0 | 0 | 0 | 480 | 4 | 0 | None | 0 | Jan | 2008 | New | Partial | 160000 |
| 2180 | MSC20 | RL | 128.00000 | 39290 | 2 | 0 | 3 | Bnk | 4 | Inside | 3 | Edwards | Norm | Norm | 1Fam | 1Story | 10 | 5 | 2008 | 2009 | Hip | CompShg | CemntBd | CmentBd | Stone | 1224.00000 | 5 | 3 | PConc | 5 | 3 | 3 | 6 | 4010.00000 | 1 | 0.00000 | 1085.00000 | 5095.00000 | GasA | 5 | Y | 4 | 5095 | 0 | 0 | 5095 | 1.00000 | 1.00000 | 2 | 1 | 2 | 1 | 5 | 15 | 8 | 2 | 4 | Attchd | 2008.00000 | 3 | 3.00000 | 1154.00000 | 3 | 3 | 2 | 546 | 484 | 0 | 0 | 0 | 0 | 0 | 0 | Elev | 17000 | Oct | 2007 | New | Partial | 183850 |
| 2181 | MSC60 | RL | 130.00000 | 40094 | 2 | 0 | 3 | Bnk | 4 | Inside | 3 | Edwards | PosN | PosN | 1Fam | 2Story | 10 | 5 | 2007 | 2008 | Hip | CompShg | CemntBd | CmentBd | Stone | 762.00000 | 5 | 3 | PConc | 5 | 3 | 3 | 6 | 2260.00000 | 1 | 0.00000 | 878.00000 | 3138.00000 | GasA | 5 | Y | 4 | 3138 | 1538 | 0 | 4676 | 1.00000 | 0.00000 | 3 | 1 | 3 | 1 | 5 | 11 | 8 | 1 | 4 | BuiltIn | 2007.00000 | 3 | 3.00000 | 884.00000 | 3 | 3 | 2 | 208 | 406 | 0 | 0 | 0 | 0 | 0 | 0 | None | 0 | Oct | 2007 | New | Partial | 184750 |
There is also a linear relationship between Lot Area and Sale Price.
fig, ax = plt.subplots(figsize=(11,7))
plt.scatter(amesH_data['Lot Area'],amesH_data['SalePrice'], c='purple', marker='.')
plt.title('Relationship Lot Area and Sale Price', fontsize=20)
plt.xlabel('Lot Area Square Footage')
plt.ylabel('Sale Price (US Dollars)')
plt.show()
Again there are a number of outliers especially the 4 values greater than 100000
amesH_data[amesH_data['Lot Area']>100000]
| MS SubClass | MS Zoning | Lot Frontage | Lot Area | Street | Alley | Lot Shape | Land Contour | Utilities | Lot Config | Land Slope | Neighborhood | Condition 1 | Condition 2 | Bldg Type | House Style | Overall Qual | Overall Cond | Year Built | Year Remod/Add | Roof Style | Roof Matl | Exterior 1st | Exterior 2nd | Mas Vnr Type | Mas Vnr Area | Exter Qual | Exter Cond | Foundation | Bsmt Qual | Bsmt Cond | Bsmt Exposure | BsmtFin Type 1 | BsmtFin SF 1 | BsmtFin Type 2 | BsmtFin SF 2 | Bsmt Unf SF | Total Bsmt SF | Heating | Heating QC | Central Air | Electrical | 1st Flr SF | 2nd Flr SF | Low Qual Fin SF | Gr Liv Area | Bsmt Full Bath | Bsmt Half Bath | Full Bath | Half Bath | Bedroom AbvGr | Kitchen AbvGr | Kitchen Qual | TotRms AbvGrd | Functional | Fireplaces | Fireplace Qu | Garage Type | Garage Yr Blt | Garage Finish | Garage Cars | Garage Area | Garage Qual | Garage Cond | Paved Drive | Wood Deck SF | Open Porch SF | Enclosed Porch | 3Ssn Porch | Screen Porch | Pool Area | Pool QC | Fence | Misc Feature | Misc Val | Mo Sold | Yr Sold | Sale Type | Sale Condition | SalePrice | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 956 | MSC20 | RL | 150.00000 | 215245 | 2 | 0 | 1 | Low | 4 | Inside | 1 | Timber | Norm | Norm | 1Fam | 1Story | 7 | 5 | 1965 | 1965 | Hip | CompShg | BrkFace | BrkFace | None | 0.00000 | 3 | 3 | CBlock | 4 | 3 | 3 | 5 | 1236.00000 | 3 | 820.00000 | 80.00000 | 2136.00000 | GasW | 3 | Y | 4 | 2036 | 0 | 0 | 2036 | 2.00000 | 0.00000 | 2 | 0 | 3 | 1 | 3 | 8 | 8 | 2 | 4 | Attchd | 1965.00000 | 2 | 2.00000 | 513.00000 | 3 | 3 | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | None | 0 | Jun | 2009 | WD | Normal | 375000 |
| 1570 | SC190 | RL | NaN | 164660 | 1 | 0 | 3 | HLS | 4 | Corner | 1 | Timber | Norm | Norm | 2fmCon | 1.5Fin | 5 | 6 | 1965 | 1965 | Gable | CompShg | Plywood | Plywood | None | 0.00000 | 3 | 3 | CBlock | 3 | 3 | 3 | 5 | 1249.00000 | 4 | 147.00000 | 103.00000 | 1499.00000 | GasA | 5 | Y | 4 | 1619 | 167 | 0 | 1786 | 2.00000 | 0.00000 | 2 | 0 | 3 | 1 | 3 | 7 | 8 | 2 | 4 | Attchd | 1965.00000 | 3 | 2.00000 | 529.00000 | 3 | 3 | 2 | 670 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | Shed | 700 | Aug | 2008 | WD | Normal | 228950 |
| 2071 | MSC20 | RL | NaN | 115149 | 2 | 0 | 2 | Low | 4 | CulDSac | 1 | ClearCr | Norm | Norm | 1Fam | 1Story | 7 | 5 | 1971 | 2002 | Gable | CompShg | Plywood | Plywood | Stone | 351.00000 | 3 | 3 | CBlock | 4 | 3 | 3 | 6 | 1219.00000 | 1 | 0.00000 | 424.00000 | 1643.00000 | GasA | 3 | Y | 4 | 1824 | 0 | 0 | 1824 | 1.00000 | 0.00000 | 2 | 0 | 2 | 1 | 4 | 5 | 8 | 2 | 3 | Attchd | 1971.00000 | 1 | 2.00000 | 739.00000 | 3 | 3 | 2 | 380 | 48 | 0 | 0 | 0 | 0 | 0 | 0 | None | 0 | Jun | 2007 | WD | Normal | 302000 |
| 2115 | MSC50 | RL | NaN | 159000 | 2 | 0 | 2 | Low | 4 | CulDSac | 1 | ClearCr | Norm | Norm | 1Fam | 1.5Fin | 6 | 7 | 1958 | 2006 | Gable | CompShg | Wd Sdng | HdBoard | BrkCmn | 472.00000 | 4 | 3 | CBlock | 4 | 3 | 3 | 3 | 697.00000 | 1 | 0.00000 | 747.00000 | 1444.00000 | GasA | 4 | Y | 4 | 1444 | 700 | 0 | 2144 | 0.00000 | 1.00000 | 2 | 0 | 4 | 1 | 4 | 7 | 8 | 2 | 3 | Attchd | 1958.00000 | 3 | 2.00000 | 389.00000 | 3 | 3 | 2 | 0 | 98 | 0 | 0 | 0 | 0 | 0 | 0 | Shed | 500 | Jun | 2007 | WD | Normal | 277000 |
There is a linear relationship for the most part between Total Basement Square Footage and Sale Price. It does not appear to be quite as tight and there are a lot of 0 values.
fig, ax = plt.subplots(figsize=(11,7))
plt.scatter(amesH_data['Total Bsmt SF'],amesH_data['SalePrice'], c='Blue', marker='.')
plt.title('Relationship Total Bsmt SF and Sale Price', fontsize=20)
plt.xlabel('Total Bsmt Square Footage')
plt.ylabel('Sale Price (US Dollars)')
plt.show()
This plot between Garage Area and Sale Price seems quite similar in shape to the scatter plot for Total Basement Square Footage and Sale Price but more spread out. It seems to be a fairly linear relationship with quite a few zero values.
fig, ax = plt.subplots(figsize=(11,7))
plt.scatter(amesH_data['Garage Area'],amesH_data['SalePrice'], c='Magenta', marker='.')
plt.title('Relationship Garage Area and Sale Price', fontsize=20)
plt.xlabel('Garage Square Footage')
plt.ylabel('Sale Price (US Dollars)')
plt.show()
In the last of the scatter plots, I wanted to see what plot would result if AboveGrade Living Area and Total Basement Square Footage were combined. The result is interesting as it shows a very strong linear relationship between Total Square Footage and Sale Price.
fig, ax = plt.subplots(figsize=(11,7))
plt.scatter((amesH_data['Gr Liv Area']+amesH_data['Total Bsmt SF']),amesH_data['SalePrice'], c='green', marker='.')
plt.title('Relationship Between Total Square Footage and Sale Price', fontsize=20)
plt.xlabel('Gr SF & Total Basement SF Square Footage')
plt.ylabel('Sale Price (US Dollars)')
plt.show()
First we take a look at the relationship between House Style and Sale Price. We can see that it is possible to purchase nearly any style of home for a price in a band of about $140,000 to $200,000 in Ames. The price band for one-story and two-story homes varies considerably starting under $50,0000 and going up to the mid-$300,000s. Interestingly, while 2.5 Finished and 2.5 Unfinished price bands are higher than that for 2Story homes, 2Story homes have a significantly higher ceiling. The remaining house styles have relatively tighter pricing bands.
var = 'House Style'
data = pd.concat([amesH_data['SalePrice'], amesH_data[var]], axis =1)
f, ax = plt.subplots(figsize=(11, 7))
fig = sns.boxplot(x=var, y='SalePrice', data=data)
fig.axis(ymin=0, ymax=800000)
ax.set_title('Sale Price by House Style', fontsize=20)
plt.ylabel('Sale Price (US Dollars)')
plt.xticks([0,1,2,3,4,5,6,7], ['One Story', 'Two Story', '1.5 Story(Finished 2ndL)', 'Split Foyer', 'Split Level', '2.5 Story(Unfinished 2ndL)', '1.5 Story(Unfinished 2ndL', '2.5 Story(Finished 2ndL'], rotation=30)
plt.show()
Next we look at Neighborhood. Immediately it is apparent that Stone Brook, Northridge Heights, and Northridge are the priciest neighborhoods with median home prices over $300,000. Greenhills is only slightly cheaper. Otherwise, homes can be purchased in most neighborhoods within our price band of $150,000 to $200,000. Aside from the upper class areas, only the upper-middle class neighborhoods of Somerset,Timber,Veenker and College Crescent have median prices above the $200,000 mark. Cheaper neighborhoods are Briardale, Meadow Village, Brookside, Old Town, Edwards, and South West of Iowa State University. It makes sense that Iowa DOT and Railroad is the cheapest place to buy a house in Ames.
neighborhoods=['North Ames', 'Gilbert', 'Stone Brook', 'North West Ames', 'Somerset', 'Briardale', 'Northpark Villa', 'Northridge Heights', 'Bloomington Heights', 'Northridge', 'Sawyer West', 'Sawyer', 'Greens', 'Brookside', 'Old Town', 'Iowa DOT and Railroad','Clear Creek', 'SW of Iowa State U', 'Edwards', 'College Creek', 'Crawford', 'Bluestem', 'Mitchell', 'Timber', 'Meadow Village', 'Veenker', 'Green Hills', 'Landmark']
var = 'Neighborhood'
data = pd.concat([amesH_data['SalePrice'], amesH_data[var]], axis =1)
f, ax = plt.subplots(figsize=(20, 10))
fig = sns.boxplot(x=var, y='SalePrice', data=data)
fig.axis(ymin=0, ymax=800000)
ax.set_title('Sale Price by Neighborhood', fontsize=20)
plt.ylabel('Sale Price (US Dollars)')
plt.xticks([0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27], neighborhoods, rotation=85)
plt.show()
This chart gives us a different perspective on Neighborhood by showing the average price by neighborhood and the frequency of homes along with a median line that allows us to compare the median home price in all of Ames.
var = 'Neighborhood'
counts = amesH_data['Neighborhood'].value_counts()#.rename_axis('neighborhood').reset_index(name='count')
c=counts.to_dict()
neighborhoods=[f'North Ames ({c["NAmes"]})', f'Gilbert ({c["Gilbert"]})', f'Stone Brook({c["StoneBr"]})', f'North West Ames({c["NWAmes"]})', f'Somerset({c["Somerst"]})', f'Briardale({c["BrDale"]})', f'Northpark Villa({c["NPkVill"]})', f'Northridge Heights({c["NridgHt"]})', f'Bloomington Heights({c["Blmngtn"]})', f'Northridge({c["NoRidge"]})', f'Sawyer West({c["SawyerW"]})', f'Sawyer({c["Sawyer"]})', f'Greens({c["Greens"]})', f'Brookside({c["BrkSide"]})', f'Old Town({c["OldTown"]})', f'Iowa DOT and Railroad',f'Clear Creek({c["IDOTRR"]})', f'SW of Iowa State U({c["SWISU"]})', f'Edwards({c["Edwards"]})', f'College Creek({c["CollgCr"]})', f'Crawford({c["Crawfor"]})', f'Bluestem({c["Blueste"]})', f'Mitchell({c["Mitchel"]})', f'Timber({c["Timber"]})', f'Meadow Village({c["MeadowV"]})', f'Veenker({c["Veenker"]})', f'Green Hills({c["GrnHill"]})', f'Landmark({c["Landmrk"]})']
data = pd.concat([amesH_data['SalePrice'], amesH_data[var]], axis =1)
f,ax= plt.subplots(figsize=(25,5))
fig = sns.barplot(var, 'SalePrice', data=data, ci=None)
ax.set_title('Average Price by Neighborhood(Frequencies) with Median Line ', fontsize=17)
#ax.bar_label(conatainers[0])
plt.ylabel('Sale Price (US Dollars)')
plt.axhline(amesH_data['SalePrice'].median(), color='magenta', linestyle='dashed')
plt.xticks([0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27], neighborhoods, rotation=45, fontsize=14)
plt.show()
We can see that while homes in Residential Low-Density have a wider price-band, they generally tend to have higher prices than Residential High-Density and Residential Medium-Density. The Floating Village zoning has the highest prices. (Note: Floating Village here refers to a type of planned community with various amenities built-in to the neighborhood)
Not surprisingly, homes that are in Commercial, Industrial or Agricultural zones tend to be cheaper.
var = 'MS Zoning'
data = pd.concat([amesH_data['SalePrice'], amesH_data[var]], axis =1)
f, ax = plt.subplots(figsize=(11,7))
fig = sns.boxplot(x=var, y='SalePrice', data=data)
fig.axis(ymin=0, ymax=800000)
ax.set_title('Sale Price by Municipal Zone', fontsize=20)
plt.xlabel('Municipal Zones')
plt.ylabel('Sale Price (US Dollars)')
plt.xticks(ticks=[0,1,2,3,4,5,6], labels=['Residential Low Density','Residential High Density', 'Floating Village', 'Residential Medium Density', 'Commercial', 'Industrial', 'Agricultural'], rotation=30)
plt.show()
Here we can see the majority of homes sold in Ames between 2006 and 2010 were built since the 1960s with a considerable number built around the turn of the millennium.
ax = amesH_data.hist(column='Year Built', figsize=(11,7),color='blue')
ax = ax[0]
for x in ax:
x.set_title('Number of Homes Sold Between 2006-10 by Year Built', fontsize=20)
x.set_xlabel('Year Built')
x.set_ylabel('Number of Houses Sold')
This set of boxplots gives us a bit more detail in terms of the price of a home depending on year built. As is perhaps obvious, newer homes tend to be more expensive. It is interesting though that houses built in 1892 seem to fetch higher prices than we might expect.
var = 'Year Built'
data = pd.concat([amesH_data['SalePrice'], amesH_data[var]], axis =1)
f, ax = plt.subplots(figsize=(22, 12))
fig = sns.boxplot(x=var, y='SalePrice', data=data)
fig.axis(ymin=0, ymax=800000)
ax.set_title('Sale Price by Year Built', fontsize=20)
plt.ylabel('Sale Price (US Dollars)')
plt.xticks(rotation=90)
plt.show()
In this chart we can see that house prices actually tend to decline in Ames in the 2006-2010 period with a fall in prices since 2007 which corresponds to the beginning of the US Housing Finance Crisis.
avg_price = amesH_data.groupby('Yr Sold')['SalePrice'].agg(np.average)
med_price = amesH_data.groupby('Yr Sold')['SalePrice'].agg(np.median)
f, ax = plt.subplots(figsize=(11,7))
plt.plot(avg_price, label='Average Price')
plt.plot(med_price, label='Median Price')
plt.legend(loc='upper right')
plt.title('Sale Price by Year', fontsize=20)
plt.xlabel('Year')
plt.ylabel('Sale Price (US Dollars)')
plt.show()
This period is significantly different than the long-term housing price trend in Iowa. As we can see in this graph provided by the Federal Reserve.
from IPython.display import Image
Image('images/fredgraphIowa.png')
Looking at the frequency distribution of Sale Price we can see that our target variable has a significant right-skew.
plt.figure(figsize=(11,7))
sns.histplot(data=amesH_data['SalePrice'], kde=True).set_title('Sale Price Frequency', fontsize=20)
plt.show()
The probability plot confirms that SalePrice does not follow a normal distribution
from scipy.stats import probplot
probplot(amesH_data['SalePrice'], plot=plt)
plt.figure(figsize=(11,7))
plt.show()
<Figure size 792x504 with 0 Axes>
I attempted to perform a log transformation on both the target variable (Sale Price) and significantly skewed independent variables. However, this resulted in both warnings and exceptions elsewhere in the code. This might be because log(x) approaches negative infinity as the value of x becomes closer to zero. (see http://onbiostatistics.blogspot.com/2012/05/logx1-data-transformation.html)
As an alternative, I utilized a log(x+1) transformation. (see https://www.kaggle.com/apapiu/house-prices-advanced-regression-techniques/regularized-linear-models)
This shifts the distribution to the right resulting in greater normality.
price = pd.DataFrame({'price':amesH_data['SalePrice'], 'log(price+1)':np.log1p(amesH_data['SalePrice'])})
price.hist(figsize=(15,5), color='blue')
plt.show()
Looking at our histograms of discrete and continuous variables, we can see that most of the continuous features are right-skewed. It would be good then to transform the most skewed of these variables using log(price+1) as we will do with our target variable.
There are a lot of discrete features (along with Mas Vnr Area) that are dominated by 0 values. This makes sense as in most cases homes do not have a pool or an enclosed porch. Nevertheless, it is probably doubtful that these zero dominated features will be important when developing our predictive models.
a_N = pd.concat([amesH_data[contFeat], amesH_data[discFeat], target], axis=1)
a_Ncorr = a_N.corr()
a_N.hist(bins=45, figsize=(17,15), color='blue')
plt.show()
I thought I would check a couple of individual histograms to see if any more detail emerges if we zoom in. We can see a bit more detail for these two features distributions but really the large number of zero values predominates.
price = pd.DataFrame({'Screen Porch':amesH_data['Screen Porch'], 'BsmtFin SF 2':amesH_data['BsmtFin SF 2']})
price.hist(figsize=(15,5), color='blue')
plt.show()
We begin by looking at an extremely large correlation matrix which includes all features in our data set.
While it is a bit chaotic, a few things stand out. First, there are some features which have a high level of collinearity. Most are fairly easy to understand: Positive: Pool Area & Pool Quality Garage Quality & Garage Condition Garage Area & Garage Cars Year Built & Year Garage Built Total Rooms Above Ground & Total Area Above Ground Basement Finish Type Square Footage & Basement Type Square Footage
Negative: Basement Unfinished Square Footage & Basement Full Bathrooms Basement Unifinished Square Footage & Basement Finish Type Square footage Year Built & Overall Condition
A bit more interesting is the close relationship between Total Basement Square Footage and First Floor Square Footage.
As well, there is a negative relationship between Enclosed Porch and Year Built.
amesCorr = amesH_data.corr()
f, ax = plt.subplots(figsize=(22,19))
cmap = sns.color_palette('Spectral', as_cmap=True)
#cmap = sns.diverging_palette(145,300, l=65, center='dark',as_cmap = True)
ax.set_title('Heat Map for All Features in Ames Housing Dataset', fontsize=20)
sns.heatmap(amesCorr, vmax=0.8, cmap=cmap)
plt.show()
Next we zoom in to look specifically at the 21 features with the highest level of correlation with our target variable - Sale Price.
We see that there are very strong correlations between Total Indoor Square Footage, Overall Quality, and AboveGround Living Area Square Footage.
n = 22
top_corr = amesCorr.nlargest(n, 'SalePrice')['SalePrice'].index
cm = np.corrcoef(amesH_data[top_corr].values.T)
n2=10
ten_corr = amesCorr.nlargest(n2, 'SalePrice')['SalePrice'].index
cm = np.corrcoef(amesH_data[top_corr].values.T)
f, ax = plt.subplots(figsize=(25,18))
#sns.set(font_scale=1.25)
mask = np.zeros_like(cm)
mask[np.triu_indices_from(mask)] = True
hm = sns.heatmap(cm, cbar=True, annot=True , square=True, fmt='.2f', annot_kws={'size':15}, yticklabels=top_corr.values, xticklabels=top_corr.values, mask=mask)
ax.set_title('Correlation Matrix Focused Top 23 Features Most Closely Correlated with Sale Price', fontsize=20)
plt.show()
Here we can again see problems of multicollinearity.
Particularly problematic are:
Positive
Garage Area and Garage Cars
Total Basement Square Footage and 1st Flr Square Footage
Total Rooms Above Ground and AboveGround Living Area
Negative Basement Unfinished Square Footage and Basement Finished Square Footage Basement Unfinshed Square Footage and Basement Full Bathrooms
We also see significant collinearity between Total Indoor Square Footage and AboveGround Living Area Square Footage, Total Basement Square Footage and 1st Floor Square Footage.
f, ax = plt.subplots(figsize=(28,22))
#mask = np.triu(np.ones_like(a_Ncorr,dtype=np.bool))
mask = np.zeros_like(a_Ncorr)
mask[np.triu_indices_from(mask)] = True
sns.heatmap(a_Ncorr, vmax=0.8, annot=True, mask=mask, square=True, fmt='.3f', annot_kws={'size':10})
ax.set_title('Correlation Matrix for Numerical Variable in Ames Housing Dataset', fontsize=25)
plt.show()
Our correlation matrix using the Kendall Rank correlation for categorical features suggests that there is not a lot of monotonicity between categorical features.
The only variables that show a degree of monotonicity are Kitchen Quality with External Quality and Garage Condition with Garage Quality.
ames_C=pd.concat([amesH_data[ordF], amesH_data[nomF], amesH_data['SalePrice']], axis=1)
a_Ccorr = ames_C.corr(method='kendall')
f, ax = plt.subplots(figsize=(25,18))
mask = np.zeros_like(a_Ccorr)
mask[np.triu_indices_from(mask)] = True
sns.heatmap(a_Ccorr, vmax=0.8, square=True, annot=True, mask=mask, fmt='.2f')
ax.set_title('Correlation Matrix (Kendall Rank) for Categorical Variables in Ames Housing Dataset', fontsize=28)
plt.show()
The last thing that I wanted to do with the EDA was a quick pair plot just to get another look a bunch of the key variables.
sns.set()
sns.pairplot(amesH_data[ten_corr], height=2.5, grid_kws=dict(diag_sharey=False))
plt.show()
First, I remove extreme outliers as well as the 1st and 99th percentile of SalePrice. Next, I consolidate the levels of several nominal variables. I separate our dataset into object and nonObject features. I use a KNN imputer to fill the Na values for Lot Frontage. Make a quick check to make sure there are no Na values. I then one-hot encode the object features using .getdummies.
Next I take a brief look at some of the most skewed features among the nonObject features before performing log(price+1) transformation for features with greater than 0.6 abs skew value.
Finally, I perform a log(price+1) transformation on the target variable.
# Remove Outliers
amesH_data = amesH_data[amesH_data['Gr Liv Area'] < 4500]
amesH_data = amesH_data[amesH_data['Lot Area'] < 100000]
amesH_data = amesH_data[amesH_data['Total Bsmt SF'] < 5000]
amesH_data = amesH_data[amesH_data['Garage Yr Blt'] < 2011]
OutlierMinMax = np.quantile(amesH_data['SalePrice'], (.01,.99))
print(OutlierMinMax)
print(len(amesH_data[(amesH_data['SalePrice']<OutlierMinMax[0]) | (amesH_data['SalePrice'] > OutlierMinMax[1])]))
amesH_data = amesH_data[(amesH_data['SalePrice'] > OutlierMinMax[0]) & (amesH_data['SalePrice'] < OutlierMinMax[1])]
[ 61685.43 456854.13] 60
The index needs to be reset after removing records
amesH_data=amesH_data.reset_index(drop='True')
amesH_data.shape
(2862, 80)
target = amesH_data['SalePrice']
# Combining Gr Liv Area and Total Bsmt SF
amesH_data['Total SF'] = amesH_data['Gr Liv Area'] + amesH_data['Total Bsmt SF']
# Simplified Several Nominal Variables
amesH_data.loc[amesH_data['Neighborhood'] == 'Landmrk', 'Neighborhood'] = 'Sawyer'
amesH_data.loc[amesH_data['Neighborhood'] == 'GrnHill', 'Neighborhood'] = 'Timber'
amesH_data.loc[amesH_data['Neighborhood'] == 'Greens', 'Neighborhood'] = 'Somerst'
amesH_data.loc[amesH_data['Neighborhood'] == 'Blueste', 'Neighborhood'] = 'Crawfor'
amesH_data.loc[amesH_data['Neighborhood'] == 'NPkVill', 'Neighborhood'] = 'NAmes'
amesH_data.loc[amesH_data['Neighborhood'] == 'Veenker', 'Neighborhood'] = 'Somerst'
amesH_data.loc[amesH_data['Neighborhood'] == 'Blmngtn', 'Neighborhood'] = 'Gilbert'
amesH_data.loc[amesH_data['Neighborhood'] == 'BrDale', 'Neighborhood'] = 'NAmes'
amesH_data.loc[amesH_data['Neighborhood'] == 'MeadowV', 'Neighborhood'] = 'Mitchel'
amesH_data.loc[amesH_data['Neighborhood'] == 'ClearCr', 'Neighborhood'] = 'Edwards'
amesH_data.loc[amesH_data['Neighborhood'] == 'SWISU', 'Neighborhood'] = 'Edwards'
amesH_data.loc[amesH_data['Neighborhood'] == 'StoneBr', 'Neighborhood'] = 'Gilbert'
amesH_data.loc[amesH_data['Neighborhood'] == 'NoRidge', 'Neighborhood'] = 'NridgHt'
foundation=['BrkTil', 'Stone', 'Wood', 'Slab']
for f in foundation:
amesH_data.loc[amesH_data['Foundation'] == f, 'Foundation'] = 'Other'
rail = ['RRNn', 'RRNe', 'RRAe', 'RRAn']
positive = ['PosA', 'PosN']
for r in rail:
amesH_data.loc[amesH_data['Condition 1'] == r, 'Condition 1'] = 'Railway'
amesH_data.loc[amesH_data['Condition 2'] == r, 'Condition 2'] = 'Railway'
for p in positive:
amesH_data.loc[amesH_data['Condition 1'] == p, 'Condition 1'] = 'PositiveAm'
amesH_data.loc[amesH_data['Condition 2'] == p, 'Condition 2'] = 'PositiveAm'
other = ['ImStucc', 'PreCast', 'Stone', 'CBlock', 'AsphShn', 'BrkComm']
for o in other:
amesH_data.loc[amesH_data['Exterior 1st'] == o, 'Exterior 1st'] = 'Other'
amesH_data.loc[amesH_data['Exterior 2nd'] == o, 'Exterior 2nd'] = 'Other'
I had originally filled Na values for Lot Frontage with the median but I decided later to use a KNN imputer for these values.
# Alternative to KNN imputer
# frontageMed = amesH_data['Lot Frontage'].median()
# amesH_data.loc[:, 'Lot Frontage'] = amesH_data.loc[:, 'Lot Frontage'].fillna(frontageMed)
obj_feat = amesH_data.select_dtypes(include=['object']).columns
nonObj_feat = amesH_data.select_dtypes(exclude = ['object']).columns
nonObj_feat = nonObj_feat.drop('SalePrice')
amesH_data_nonObjF = amesH_data[nonObj_feat]
amesH_data_objF = amesH_data[obj_feat]
amesH_data_objF.shape
(2862, 22)
amesH_data_nonObjF.shape
(2862, 58)
print(amesH_data_objF.isnull().values.sum())
print(amesH_data_nonObjF.isnull().values.sum())
0 484
#KNN imputer - impute values for Lot Frontage NAs
imputer = KNNImputer(n_neighbors=4)
imputer.fit_transform(amesH_data_nonObjF)
Frontage_Impute = imputer.transform(amesH_data_nonObjF)
amesH_data_nonObjF = pd.DataFrame(imputer.fit_transform(amesH_data_nonObjF), columns = amesH_data_nonObjF.columns)
print(amesH_data_nonObjF.isnull().values.sum())
0
skewness = amesH_data_nonObjF.apply(lambda x:skew(x))
skewness = skewness[abs(skewness) > 0.6]
print(skewness.sort_values().head(10))
print(skewness.sort_values().tail(20))
Utilities -43.03017 Street -18.83488 Functional -5.09800 Land Slope -5.08796 Electrical -4.33723 Bsmt Cond -3.57597 Garage Cond -3.52952 Garage Qual -3.41370 Paved Drive -3.14173 Bsmt Qual -1.28650 dtype: float64 Lot Frontage 0.99734 Bsmt Exposure 1.26285 Exter Cond 1.52184 Fence 1.73925 Wood Deck SF 1.80910 Open Porch SF 2.46692 Mas Vnr Area 2.56967 BsmtFin Type 2 3.13578 Lot Area 3.51691 Bsmt Half Bath 3.92933 Screen Porch 4.00615 Enclosed Porch 4.01926 BsmtFin SF 2 4.08214 Alley 4.14758 Kitchen AbvGr 4.29006 3Ssn Porch 11.26195 Low Qual Fin SF 12.32720 Pool Area 18.68478 Pool QC 18.71996 Misc Val 22.49727 dtype: float64
skewed_feat = skewness.index
amesH_data_nonObjF = amesH_data_nonObjF.copy()
amesH_data_nonObjF[skewed_feat] = np.log1p(amesH_data_nonObjF[skewed_feat])
y = target
y = np.log1p(y)
Normal probability plot shows that the log transformation has helped to make Sale Price better approximate a normal distribution.
plt.figure(figsize=(11,7))
probplot(y, plot=plt)
plt.show()
amesH_data_objF=pd.get_dummies(amesH_data_objF)
amesH_data_nonObjF = amesH_data_nonObjF.reset_index(drop=True)
amesH_data_objF = amesH_data_objF.reset_index(drop=True)
print(amesH_data_objF.shape, amesH_data_nonObjF.shape)
(2862, 162) (2862, 58)
y.shape
(2862,)
We concatenate our object and nonObject features and then create train and test sets using 70% for train and 30% for test.
X = pd.concat([amesH_data_nonObjF, amesH_data_objF], axis=1)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)
print('X_train: ' + str(X_train.shape))
print('X_test: ' + str(X_test.shape))
print('y_train: ' + str(y_train.shape))
print('y_test: ' + str(y_test.shape))
X_train: (2003, 220) X_test: (859, 220) y_train: (2003,) y_test: (859,)
Next we standardized our data to prevent large discrepencies in scale between various features causing distortions. Note this is done after train/test split in order to prevent data leakage. Also, while fit_transform is used on the X_train data, transform is used on the X_test data to again ensure no data leakage.
stdSc = StandardScaler()
X_train = X_train.copy()
X_test = X_test.copy()
X_train.loc[:, nonObj_feat] = stdSc.fit_transform(X_train.loc[:, nonObj_feat])
X_test.loc[:, nonObj_feat] = stdSc.transform(X_test.loc[:, nonObj_feat])
X_test.head()
| Lot Frontage | Lot Area | Street | Alley | Lot Shape | Utilities | Land Slope | Overall Qual | Overall Cond | Year Built | Year Remod/Add | Mas Vnr Area | Exter Qual | Exter Cond | Bsmt Qual | Bsmt Cond | Bsmt Exposure | BsmtFin Type 1 | BsmtFin SF 1 | BsmtFin Type 2 | BsmtFin SF 2 | Bsmt Unf SF | Total Bsmt SF | Heating QC | Electrical | 1st Flr SF | 2nd Flr SF | Low Qual Fin SF | Gr Liv Area | Bsmt Full Bath | Bsmt Half Bath | Full Bath | Half Bath | Bedroom AbvGr | Kitchen AbvGr | Kitchen Qual | TotRms AbvGrd | Functional | Fireplaces | Fireplace Qu | Garage Yr Blt | Garage Finish | Garage Cars | Garage Area | Garage Qual | Garage Cond | Paved Drive | Wood Deck SF | Open Porch SF | Enclosed Porch | 3Ssn Porch | Screen Porch | Pool Area | Pool QC | Fence | Misc Val | Yr Sold | Total SF | MS SubClass_MSC120 | MS SubClass_MSC150 | MS SubClass_MSC160 | MS SubClass_MSC180 | MS SubClass_MSC20 | MS SubClass_MSC30 | MS SubClass_MSC40 | MS SubClass_MSC45 | MS SubClass_MSC50 | MS SubClass_MSC60 | MS SubClass_MSC70 | MS SubClass_MSC75 | MS SubClass_MSC80 | MS SubClass_MSC85 | MS SubClass_MSC90 | MS SubClass_SC190 | MS Zoning_A (agr) | MS Zoning_C (all) | MS Zoning_FV | MS Zoning_I (all) | MS Zoning_RH | MS Zoning_RL | MS Zoning_RM | Land Contour_Bnk | Land Contour_HLS | Land Contour_Low | Land Contour_Lvl | Lot Config_Corner | Lot Config_CulDSac | Lot Config_FR2 | Lot Config_FR3 | Lot Config_Inside | Neighborhood_BrkSide | Neighborhood_CollgCr | Neighborhood_Crawfor | Neighborhood_Edwards | Neighborhood_Gilbert | Neighborhood_IDOTRR | Neighborhood_Mitchel | Neighborhood_NAmes | Neighborhood_NWAmes | Neighborhood_NridgHt | Neighborhood_OldTown | Neighborhood_Sawyer | Neighborhood_SawyerW | Neighborhood_Somerst | Neighborhood_Timber | Condition 1_Artery | Condition 1_Feedr | Condition 1_Norm | Condition 1_PositiveAm | Condition 1_Railway | Condition 2_Artery | Condition 2_Feedr | Condition 2_Norm | Condition 2_PositiveAm | Condition 2_Railway | Bldg Type_1Fam | Bldg Type_2fmCon | Bldg Type_Duplex | Bldg Type_Twnhs | Bldg Type_TwnhsE | House Style_1.5Fin | House Style_1.5Unf | House Style_1Story | House Style_2.5Fin | House Style_2.5Unf | House Style_2Story | House Style_SFoyer | House Style_SLvl | Roof Style_Flat | Roof Style_Gable | Roof Style_Gambrel | Roof Style_Hip | Roof Style_Mansard | Roof Style_Shed | Roof Matl_CompShg | Roof Matl_Membran | Roof Matl_Metal | Roof Matl_Roll | Roof Matl_Tar&Grv | Roof Matl_WdShake | Roof Matl_WdShngl | Exterior 1st_AsbShng | Exterior 1st_BrkFace | Exterior 1st_CemntBd | Exterior 1st_HdBoard | Exterior 1st_MetalSd | Exterior 1st_Other | Exterior 1st_Plywood | Exterior 1st_Stucco | Exterior 1st_VinylSd | Exterior 1st_Wd Sdng | Exterior 1st_WdShing | Exterior 2nd_AsbShng | Exterior 2nd_Brk Cmn | Exterior 2nd_BrkFace | Exterior 2nd_CmentBd | Exterior 2nd_HdBoard | Exterior 2nd_MetalSd | Exterior 2nd_Other | Exterior 2nd_Plywood | Exterior 2nd_Stucco | Exterior 2nd_VinylSd | Exterior 2nd_Wd Sdng | Exterior 2nd_Wd Shng | Mas Vnr Type_BrkCmn | Mas Vnr Type_BrkFace | Mas Vnr Type_CBlock | Mas Vnr Type_None | Mas Vnr Type_Stone | Foundation_CBlock | Foundation_Other | Foundation_PConc | Heating_Floor | Heating_GasA | Heating_GasW | Heating_Grav | Heating_OthW | Heating_Wall | Central Air_N | Central Air_Y | Garage Type_2Types | Garage Type_Attchd | Garage Type_Basment | Garage Type_BuiltIn | Garage Type_CarPort | Garage Type_Detchd | Garage Type_None | Misc Feature_Gar2 | Misc Feature_None | Misc Feature_Othr | Misc Feature_Shed | Misc Feature_TenC | Mo Sold_Apr | Mo Sold_Aug | Mo Sold_Dec | Mo Sold_Feb | Mo Sold_Jan | Mo Sold_Jul | Mo Sold_Jun | Mo Sold_Mar | Mo Sold_May | Mo Sold_Nov | Mo Sold_Oct | Mo Sold_Sep | Sale Type_COD | Sale Type_CWD | Sale Type_Con | Sale Type_ConLD | Sale Type_ConLI | Sale Type_ConLw | Sale Type_New | Sale Type_Oth | Sale Type_VWD | Sale Type_WD | Sale Condition_Abnorml | Sale Condition_AdjLand | Sale Condition_Alloca | Sale Condition_Family | Sale Condition_Normal | Sale Condition_Partial | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 1568 | 0.37881 | 0.72212 | 0.05481 | -0.25617 | -0.98740 | 0.02943 | 0.19865 | -0.08800 | 1.27581 | -1.08548 | 0.02159 | -0.79150 | -0.70786 | -0.21022 | 0.48492 | 0.15713 | 0.68928 | 0.66588 | 0.75821 | -0.24757 | -0.36749 | 0.09426 | -0.14648 | 0.87175 | 0.24316 | -0.34592 | 1.16891 | -0.11173 | 0.67671 | 1.13453 | -0.24030 | 0.78711 | -0.75523 | 1.43250 | -0.20930 | -0.78500 | 0.47809 | -4.43749 | -0.98797 | -0.97916 | 0.46453 | -0.80978 | -1.02740 | -0.22907 | 0.26038 | 0.25412 | 0.28519 | 1.18716 | -1.09097 | 2.44022 | -0.10499 | -0.30477 | -0.06322 | -0.06265 | -0.48445 | 5.92911 | 0.16373 | 0.27378 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 |
| 547 | 1.00202 | 1.17495 | 0.05481 | -0.25617 | -0.98740 | 0.02943 | 0.19865 | 1.39950 | 1.98639 | 0.26481 | -0.21924 | 1.02881 | 1.09382 | 2.34556 | 0.48492 | 1.08786 | 2.02822 | 0.66588 | 1.08380 | 2.05234 | 2.02571 | -0.18708 | 2.34039 | -0.17133 | 0.24316 | 1.92778 | -0.86162 | -0.11173 | 1.13202 | 1.13453 | -0.24030 | 0.78711 | -0.75523 | -1.06798 | -0.20930 | 0.75502 | -0.19495 | -6.53043 | 1.81444 | 1.23695 | 0.12582 | 0.31660 | 1.66970 | 1.89141 | 0.26038 | 0.25412 | 0.28519 | 1.43183 | -1.09097 | -0.42340 | 9.87583 | -0.30477 | -0.06322 | -0.06265 | 2.05290 | -0.18195 | 0.92638 | 2.04566 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 |
| 33 | -2.84084 | -2.69168 | 0.05481 | -0.25617 | 0.68680 | 0.02943 | 0.19865 | 0.65575 | 0.47022 | 0.09752 | -0.46008 | -0.79150 | -0.70786 | -0.21022 | 0.48492 | 0.15713 | -0.64966 | -1.21996 | -1.43412 | -0.24757 | -0.36749 | 0.59455 | -0.50105 | 0.87175 | 0.24316 | -0.85919 | -0.86162 | -0.11173 | -1.71643 | -0.83958 | -0.24030 | -1.04826 | -0.75523 | -1.06798 | -0.20930 | -0.78500 | -1.89085 | 0.21699 | -0.98797 | -0.97916 | -0.06302 | -0.80978 | -1.02740 | -0.78203 | 0.26038 | 0.25412 | 0.28519 | -0.94284 | 0.50833 | -0.42340 | -0.10499 | -0.30477 | -0.06322 | -0.06265 | -0.48445 | -0.18195 | 1.68904 | -1.17794 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 |
| 2576 | -0.25785 | -0.91853 | 0.05481 | -0.25617 | 0.68680 | 0.02943 | 0.19865 | -1.57551 | 0.47022 | -1.77109 | -0.21924 | -0.79150 | -0.70786 | -0.21022 | -0.28112 | -1.04279 | -0.64966 | -0.74850 | 0.47419 | -0.24757 | -0.36749 | 0.61128 | 0.24911 | -1.21441 | 0.24316 | 0.14308 | 1.04440 | -0.11173 | 0.49977 | -0.83958 | -0.24030 | -1.04826 | -0.75523 | 1.43250 | -0.20930 | -2.32502 | 1.60279 | 0.21699 | 0.78015 | 0.68292 | -2.17239 | -0.80978 | -1.02740 | -1.48404 | -0.67653 | -0.68264 | -3.79882 | -0.94284 | -1.09097 | -0.42340 | -0.10499 | -0.30477 | -0.06322 | -0.06265 | -0.48445 | -0.18195 | -1.36159 | 0.37092 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 |
| 852 | -1.64748 | -1.35744 | 0.05481 | -0.25617 | 0.68680 | 0.02943 | 0.19865 | -0.08800 | -0.45977 | 1.02893 | 0.93677 | 1.17742 | 1.09382 | -0.21022 | 0.48492 | 0.15713 | 1.47251 | 1.13734 | 0.77074 | -0.24757 | -0.36749 | -0.26465 | -0.47211 | 0.87175 | 0.24316 | -0.81404 | -0.86162 | -0.11173 | -1.67027 | 1.13453 | -0.24030 | -1.04826 | -0.75523 | -2.31821 | -0.20930 | 0.75502 | -1.89085 | 0.21699 | -0.98797 | -0.97916 | 0.98839 | 1.44298 | 0.32115 | -0.24349 | 0.26038 | 0.25412 | 0.28519 | 0.97182 | -1.09097 | -0.42340 | -0.10499 | -0.30477 | -0.06322 | -0.06265 | -0.48445 | -0.18195 | 0.92638 | -1.14510 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 |
Based on the correlation matrices above I have selected 16 variables for a preliminary multiple linear regression model. I took the best correlated variables
Basic Features:
basicFeatures = ['Overall Qual', 'Gr Liv Area', 'Exter Qual', 'Total Bsmt SF', 'Garage Area', 'Bsmt Qual', 'Year Built', 'Garage Finish', 'Full Bath', 'Year Remod/Add', 'Fireplace Qu', 'Mas Vnr Area', 'Heating QC', 'BsmtFin SF 1', 'Lot Frontage', 'Lot Area']
In this section I try two different methods of feature selection using the sklearn feature.selection module's SelectFromModel.
By default the random forest regressor will identify features that produce significantly higher decreases in mean squared error than an average of all features.
Through trial and error I found that progressively lowering max features (@each split), max depth of trees, and max samples (number of samples drawn from X to train each base estimator) works best. This resulted in progressively better RSME scores particularly on the training data(see final model below). However, when I pushed these values too low, suddenly the RSME for test shot up to 50,000.
In terms of number of trees, I found that 50,000 trees produced very slightly better results than 10,000 but taking about double the amount of time. As such I went with the 10,000 trees.
Through all my trials the threshold for selection stayed pretty stable at 0.0038
rf_featSelect = SelectFromModel(RandomForestRegressor(n_estimators = 10000, max_features=10, random_state=0, max_depth=5, max_samples=10,n_jobs=-1))
rf_featSelect.fit(X_train, y_train)
select_feat_rf = X_train.columns[(rf_featSelect.get_support())]
print('select_feat_rf')
print('Threshold for selection: ', rf_featSelect.threshold_)
X_select = rf_featSelect.transform(X_train)
importances = rf_featSelect.estimator_.feature_importances_
sorted_indicies = np.argsort(importances)[::-1]
importantFeatureCount = X_select.shape[1]
X_train.columns[sorted_indicies][:X_select.shape[1]]
select_feat_rf Threshold for selection: 0.004545454545454545
Index(['Total SF', 'Gr Liv Area', 'Overall Qual', 'Garage Area', 'Year Built',
'Total Bsmt SF', '1st Flr SF', 'Garage Yr Blt', 'Year Remod/Add',
'Lot Area', 'Exter Qual', 'Garage Cars', 'Bsmt Qual', 'Lot Frontage',
'Kitchen Qual', 'TotRms AbvGrd', 'Fireplace Qu', 'Open Porch SF',
'Garage Finish', 'BsmtFin SF 1', 'Bsmt Unf SF', 'Full Bath',
'BsmtFin Type 1', 'Mas Vnr Area', 'Heating QC', 'Wood Deck SF',
'Foundation_PConc', 'Fireplaces', '2nd Flr SF', 'Yr Sold',
'Bedroom AbvGr', 'Overall Cond', 'Bsmt Exposure', 'Garage Type_Attchd',
'Foundation_CBlock', 'Mas Vnr Type_None', 'Garage Type_Detchd',
'Exterior 1st_VinylSd', 'Lot Shape', 'Exterior 2nd_VinylSd',
'Half Bath', 'Bsmt Full Bath', 'Neighborhood_NridgHt',
'MS SubClass_MSC60', 'MS Zoning_RL', 'Roof Style_Hip', 'MS Zoning_RM',
'Mas Vnr Type_BrkFace', 'House Style_2Story', 'Roof Style_Gable'],
dtype='object')
The number of selected features went up considerably with hyperparameter tuning. As the number of features increased, the relative importance of most important features dropped from for Overall Quality and Total Indoor Square footage.
I think that by applying a more rigorous method of hyperparameter fine tuning it may be possible to further optimize this selector.
RF_features=pd.Series(importances[sorted_indicies][:importantFeatureCount], index= X_train.columns[sorted_indicies][:importantFeatureCount])
RF_features
Total SF 0.04130 Gr Liv Area 0.03657 Overall Qual 0.03557 Garage Area 0.03363 Year Built 0.03199 Total Bsmt SF 0.03103 1st Flr SF 0.03008 Garage Yr Blt 0.02897 Year Remod/Add 0.02699 Lot Area 0.02304 Exter Qual 0.02164 Garage Cars 0.02154 Bsmt Qual 0.02147 Lot Frontage 0.02135 Kitchen Qual 0.02050 TotRms AbvGrd 0.01825 Fireplace Qu 0.01816 Open Porch SF 0.01787 Garage Finish 0.01783 BsmtFin SF 1 0.01743 Bsmt Unf SF 0.01721 Full Bath 0.01629 BsmtFin Type 1 0.01609 Mas Vnr Area 0.01527 Heating QC 0.01465 Wood Deck SF 0.01363 Foundation_PConc 0.01311 Fireplaces 0.01289 2nd Flr SF 0.01246 Yr Sold 0.00949 Bedroom AbvGr 0.00902 Overall Cond 0.00864 Bsmt Exposure 0.00857 Garage Type_Attchd 0.00788 Foundation_CBlock 0.00774 Mas Vnr Type_None 0.00731 Garage Type_Detchd 0.00729 Exterior 1st_VinylSd 0.00715 Lot Shape 0.00671 Exterior 2nd_VinylSd 0.00671 Half Bath 0.00638 Bsmt Full Bath 0.00617 Neighborhood_NridgHt 0.00561 MS SubClass_MSC60 0.00554 MS Zoning_RL 0.00520 Roof Style_Hip 0.00498 MS Zoning_RM 0.00494 Mas Vnr Type_BrkFace 0.00488 House Style_2Story 0.00478 Roof Style_Gable 0.00455 dtype: float64
rf_top_features = pd.Series(importances[sorted_indicies][:importantFeatureCount], index= X_train.columns[sorted_indicies][:importantFeatureCount])
f, ax = plt.subplots(figsize=(25,14))
plt.title('Random Forest Feature Importance', fontsize=25)
plt.barh(range(X_select.shape[1]), importances[sorted_indicies][:importantFeatureCount])
plt.yticks(range(X_select.shape[1]), X_train.columns[sorted_indicies][:importantFeatureCount])
plt.tight_layout
ax.invert_yaxis()
plt.show()
For the LassoCV regressor, I set intial alpha values then test them to obtain optimal alpha scores.
lasso = LassoCV(alphas = [0.0001, 0.0003, 0.0006, 0.001, 0.003, 0.006, 0.01, 0.03, 0.06, 0.1, 0.3, 0.6, 1], max_iter = 50000, cv = 10)
lasso.fit(X_train, y_train)
alpha = lasso.alpha_
print('Best alpha: ', alpha)
Best alpha: 0.0006
lasso_A = LassoCV(alphas = [alpha*0.6, alpha * 0.65, alpha * 0.7, alpha * 0.75, alpha * 0.8, alpha * 0.9, alpha * 0.95, alpha * 1.05, alpha * 1.1, alpha * 1.15, alpha * 1.25, alpha * 1.3, alpha * 1.35, alpha * 1.4], max_iter=50000, cv = 10)
I then take my improved model and use it with SelectFromModel. The key hyperparameter here is the threshold value which represents the cut off in terms of significant features. At a high level such as 0.25, the lasso model returns zero features. In order to tune the threshold parameter, I implemented a while loop that progressively lowered the value until 61 features were obtained.
lassofeatSelect2 = SelectFromModel(lasso_A, threshold=0.25)
lassofeatSelect2.fit(X_train,y_train)
n_features = lassofeatSelect2.transform(X_train).shape[1]
while n_features < 61:
lassofeatSelect2.threshold -=0.001
X_transform = lassofeatSelect2.transform(X_train)
n_features = X_transform.shape[1]
print('Number of features:', n_features, 'n\threshold: ', lassofeatSelect2.threshold)
select_feat_lasso = X_train.columns[(lassofeatSelect2.get_support())]
Number of features: 61 n hreshold: 0.004999999999999785
Here is the list of the top coefficients.
coeffs = pd.Series(lassofeatSelect2.estimator_.coef_, index=X_train.columns)
key_coeffs = pd.concat([coeffs.sort_values().head(20), coeffs.sort_values().tail(41)])
key_coeffs
Sale Condition_Abnorml -0.06635 Condition 1_Artery -0.04179 MS Zoning_RM -0.03163 Neighborhood_Mitchel -0.03056 Neighborhood_CollgCr -0.02141 Neighborhood_SawyerW -0.01904 Sale Condition_Family -0.01709 Exterior 2nd_Plywood -0.01690 Neighborhood_NWAmes -0.01633 Neighborhood_Edwards -0.01416 Kitchen AbvGr -0.01032 Bsmt Unf SF -0.00947 Central Air_N -0.00945 MS SubClass_MSC60 -0.00927 Garage Type_CarPort -0.00730 Lot Config_FR2 -0.00659 Bsmt Cond -0.00621 Bedroom AbvGr -0.00619 Total Bsmt SF -0.00510 Mo Sold_Mar -0.00509 Paved Drive 0.00490 Garage Yr Blt 0.00608 Mas Vnr Type_Stone 0.00657 BsmtFin Type 1 0.00688 Full Bath 0.00776 Mo Sold_Jul 0.00790 MS SubClass_MSC20 0.00881 Exter Qual 0.00919 Garage Cars 0.00946 Screen Porch 0.01003 Foundation_PConc 0.01079 Neighborhood_Timber 0.01135 Bsmt Qual 0.01173 Land Contour_HLS 0.01176 Heating QC 0.01176 BsmtFin SF 1 0.01442 Bsmt Exposure 0.01458 MS Zoning_FV 0.01487 Year Remod/Add 0.01585 Fireplaces 0.01602 Bsmt Full Bath 0.01616 Kitchen Qual 0.01635 Functional 0.01790 Garage Area 0.01938 MS SubClass_MSC70 0.02176 Condition 1_PositiveAm 0.02472 Bldg Type_1Fam 0.02611 Neighborhood_Somerst 0.02765 MS SubClass_MSC120 0.03256 Neighborhood_BrkSide 0.03527 Condition 1_Norm 0.03593 Overall Cond 0.03932 Sale Type_New 0.03971 Lot Area 0.03978 Neighborhood_NridgHt 0.04594 Exterior 1st_BrkFace 0.05168 Year Built 0.05748 Overall Qual 0.06838 Gr Liv Area 0.07540 Neighborhood_Crawfor 0.08481 Total SF 0.08544 dtype: float64
The contrast between the variables selected by the Random Forest selector and the Lasso selector is very interesting.
The Random Forest selector tended to pick mainly numeric(nonObj) variables and the most important of these tend to be somewhat similar to what we might expect based on correlations with Sale Price.
While the Lasso Selector did choose variables such as Total SF and Overall Quality, it also chose a large number of categorical(obj) variables. Some of these were judged to be very important such as Neighborhood_Crawford, Exterior 1st_BrkFace, and Sale Condition Abnormal. I found it particularly interesting that Neighborhoods assumed such an increased importance as this group of features had been ignored by the Random Forest Selector.
I believe this points to the Random Forest selector favoring numeric variables with a high level of cardinality.
coeffs = pd.Series(lassofeatSelect2.estimator_.coef_, index=X_train.columns)
key_coeffs = pd.concat([coeffs.sort_values().head(20), coeffs.sort_values().tail(41)])
f, ax = plt.subplots(figsize = (25,16))
key_coeffs.plot(kind='barh')
plt.title('Coefficients for Lasso Regression', fontsize=25)
plt.show()
Main Feature Selection
Here I set up my main feature selection which includes 40 variables. The commented out lists of features represent alternative selections of features that I had tried earlier.
# CombFeat = ['Total SF', 'Overall Qual', 'Year Built', 'Garage Area', 'Year Remod/Add', 'Bsmt Qual', 'Lot Area', 'Heating QC', 'Overall Cond', 'Kitchen Qual', 'Exter Qual', 'Mas Vnr Area', 'Full Bath', 'Garage Finish', 'Sale Condition_Abnorml', 'Neighborhood_Crawfor', 'Exterior 1st_BrkFace', 'Sale Condition_Family', 'Sale Type_New', 'Exterior 1st_AsbShng', 'Neighborhood_BrkSide', 'Neighborhood_NoRidge', 'Neighborhood_NridgHt', 'Neighborhood_Somerst', 'Central Air_N', 'Bsmt Exposure_3', 'Condition 1_PosN', 'MS Zoning_FV', 'Condition 1_Norm', 'MS Zoning_RL', 'Functional', 'Neighborhood_StoneBr', 'MS Zoning_C (all)', 'Condition 1_Artery', 'BsmtFin SF 1', 'Mas Vnr Type_Stone','Neighborhood_SawyerW', 'Neighborhood_Gilbert', 'Neighborhood_Edwards', 'Neighborhood_IDOTRR']
# Feat_Select_N = ['Total SF', 'Overall Qual', 'Year Built', 'Garage Area', 'Year Remod/Add', 'Bsmt Qual', 'Lot Area', 'Heating QC', 'Overall Cond', 'Kitchen Qual', 'Exter Qual', 'BsmtFin SF 1','Mas Vnr Area', 'Full Bath', 'Garage Finish', 'Neighborhood_NAmes', 'Neighborhood_Edwards', 'Neighborhood_CollgCr', 'Neighborhood_Gilbert', 'Neighborhood_OldTown', 'Neighborhood_NridgHt', 'Neighborhood_Somerst', 'Neighborhood_Sawyer', 'Neighborhood_Mitchel', 'Neighborhood_NWAmes', 'Neighborhood_SawyerW', 'Neighborhood_Crawfor', 'Neighborhood_BrkSide', 'Neighborhood_IDOTRR', 'Neighborhood_Timber','Foundation_PConc','Foundation_CBlock','Sale Condition_Abnorml', 'Sale Condition_Family', 'Sale Type_New', 'Central Air_N', 'Exterior 1st_BrkFace', 'Exterior 1st_AsbShng', 'Condition 1_Positive']
Feat_Select = ['Total SF', 'Overall Qual', 'Year Built', 'Garage Area', 'Year Remod/Add', 'Bsmt Qual', 'Lot Area', 'Heating QC', 'Overall Cond', 'Kitchen Qual', 'Exter Qual', 'BsmtFin SF 1','Mas Vnr Area', 'Full Bath', 'Garage Finish', 'Neighborhood_NAmes', 'Neighborhood_Edwards', 'Neighborhood_CollgCr', 'Neighborhood_Gilbert', 'Neighborhood_OldTown', 'Neighborhood_NridgHt', 'Neighborhood_Somerst', 'Neighborhood_Sawyer', 'Neighborhood_Mitchel', 'Neighborhood_NWAmes', 'Neighborhood_SawyerW', 'Neighborhood_Crawfor', 'Neighborhood_BrkSide', 'Neighborhood_IDOTRR', 'Neighborhood_Timber','Foundation_PConc','Foundation_CBlock', 'Foundation_Other', 'Sale Condition_Abnorml', 'Sale Condition_Family', 'Sale Type_New', 'Central Air_N', 'Exterior 1st_BrkFace', 'Exterior 1st_AsbShng', 'Condition 1_PositiveAm' ]
len(Feat_Select)
40
In this section I create a series of basic multiple regression models with different selections of features.
Before running the preliminary models I set up my K-Fold Cross Validation Scoring Metrics (Root Mean Squared Error, Mean Absolute Error,Adjusted R2).
scorer_rmse = make_scorer(mean_squared_error, greater_is_better=False)
scorer_mae = make_scorer(mean_absolute_error, greater_is_better=False)
def rmse_cv_train(model, X_train,cv=10, nj=-1, pred=24):
rmse = np.sqrt(-cross_val_score(model, X_train, y_train, scoring = scorer_rmse, cv=cv, pre_dispatch=pred, n_jobs=nj))
return(rmse)
def rmse_cv_test(model, X_test,cv=10, nj=-1, pred=24):
rmse = np.sqrt(-cross_val_score(model, X_test, y_test, scoring = scorer_rmse, cv=cv, pre_dispatch=pred, n_jobs=nj))
return(rmse)
def mae_cv_train(model, X_train, cv=10, nj=-1, pred=24):
mae = -cross_val_score(model, X_train, y_train, scoring = scorer_mae, cv=cv, pre_dispatch=pred, n_jobs=nj)
return(mae)
def mae_cv_test(model, X_test, cv=10, nj=-1, pred=24):
mae=-cross_val_score(model, X_test, y_test, scoring= scorer_mae, cv =cv, pre_dispatch=pred, n_jobs=nj)
return(mae)
def r2adj_cv_train(model,X_train, cv=10, nj=-1, pred=24):
r2 = cross_val_score(model, X_train, y_train, scoring='r2', cv=cv, pre_dispatch=pred, n_jobs=nj)
adj_r2 = 1-(1-r2) * (len(y_train)-1)/len(y_train)
return adj_r2
def r2adj_cv_test(model,X_test, cv=10, nj=-1, pred=24):
r2 = cross_val_score(model, X_test, y_test, scoring='r2', cv=cv, pre_dispatch=pred, n_jobs=nj)
adj_r2 = 1-(1-r2) * (len(y_test)-1)/len(y_test)
return adj_r2
Originally, I had intended to include a Multiple Linear Regression model that included all 81 features. This though did not work at all and resulted in a broken model. While the RSME for the test data was fairly normal, the RSME for the test data ballooned . The R2 for the training data was negative which indicates that the model is substantially worse than the mean value of Sale Price as a predictor.
lr_all = LinearRegression()
lr_all.fit(X_train, y_train)
LinearRegression()
rmse_cv_tr = rmse_cv_train(lr_all, X_train).mean()
rmse_cv_tst = rmse_cv_test(lr_all, X_test).mean()
mae_cv_tr = mae_cv_train(lr_all, X_train).mean()
mae_cv_tst = mae_cv_test(lr_all, X_test).mean()
r2adj_cv_tr = r2adj_cv_train(lr_all, X_train).mean()
r2adj_cv_tst = r2adj_cv_test(lr_all, X_test).mean()
pre_results = pd.DataFrame({'Linear Regression':{'RMSE CV Train': rmse_cv_tr, 'RMSE CV Test': rmse_cv_tst, 'MAE CV Train': mae_cv_tr, 'MAE CV Test': mae_cv_tst, 'AdjR2 CV Train': r2adj_cv_tr, 'AdjR2 CV Test': r2adj_cv_tst}})
pre_results
| Linear Regression | |
|---|---|
| AdjR2 CV Test | -1126930666536769945600.00000 |
| AdjR2 CV Train | -4758674034797168640.00000 |
| MAE CV Test | 658335239.48019 |
| MAE CV Train | 21855952.03904 |
| RMSE CV Test | 5750750202.38194 |
| RMSE CV Train | 300810559.93264 |
y_train_pred = lr_all.predict(X_train)
y_test_pred = lr_all.predict(X_test)
Extremely strange residuals plot and fitted-line chart.
plt.scatter(y_train_pred, y_train_pred - y_train, c='green', marker='.', label= 'Training Data')
plt.scatter(y_test_pred, y_test_pred - y_test, c='orange', marker='.', label = 'Test Data')
plt.title('Linear Regression with All Features')
plt.xlabel('Predicted values')
plt.ylabel('Residuals')
plt.legend(loc='upper left')
plt.hlines(y = 0, xmin=10.5, xmax=13.5, color='purple')
plt.show()
plt.scatter(y_train_pred, y_train, c = 'blue', marker='.', label = 'Training data')
plt.scatter(y_test_pred, y_test, c = 'deeppink', marker='.', label = 'Test Data')
plt.title('Linear Regression with All Feature')
plt.xlabel('Predicted Values')
plt.ylabel('Actual Values')
plt.legend(loc = 'upper left')
plt.plot([10.5, 13.5], [10.5, 13.5], c = 'green')
plt.show()
As an alternative for my base linear model, I took the top variables correlated with Sale Price minus several variables that had a high level of collinearity. This left me with 16 variables for the base model.
Overall, the base model performed fairly well.
lr_basic = LinearRegression()
lr_basic.fit(X_train[basicFeatures], y_train)
LinearRegression()
rmse_cv_tr = rmse_cv_train(lr_basic, X_train[basicFeatures]).mean()
rmse_cv_tst = rmse_cv_test(lr_basic, X_test[basicFeatures]).mean()
mae_cv_tr = mae_cv_train(lr_basic, X_train[basicFeatures]).mean()
mae_cv_tst = mae_cv_test(lr_basic, X_test[basicFeatures]).mean()
r2adj_cv_tr = r2adj_cv_train(lr_basic, X_train[basicFeatures]).mean()
r2adj_cv_tst = r2adj_cv_test(lr_basic, X_test[basicFeatures]).mean()
basic_results = pd.DataFrame({'LR BasicFeatures':{'RMSE CV Train': rmse_cv_tr, 'RMSE CV Test': rmse_cv_tst, 'MAE CV Train': mae_cv_tr, 'MAE CV Test': mae_cv_tst, 'AdjR2 CV Train': r2adj_cv_tr, 'AdjR2 CV Test': r2adj_cv_tst}})
basic_results
| LR BasicFeatures | |
|---|---|
| AdjR2 CV Test | 0.88048 |
| AdjR2 CV Train | 0.88951 |
| MAE CV Test | 0.08844 |
| MAE CV Train | 0.09001 |
| RMSE CV Test | 0.11922 |
| RMSE CV Train | 0.12347 |
pre_results = pd.concat([pre_results, basic_results], axis=1)
y_train_pred_basic = lr_basic.predict(X_train[basicFeatures])
y_test_pred_basic = lr_basic.predict(X_test[basicFeatures])
We can see that the residuals are fairly tightly packed around the mean and there is no discernable pattern.
plt.scatter(y_train_pred_basic, y_train_pred_basic - y_train, c='green', marker='.', label= 'Training Data')
plt.scatter(y_test_pred_basic, y_test_pred_basic - y_test, c='orange', marker='.', label = 'Test Data')
plt.title('Linear Regression with Basic Features')
plt.xlabel('Predicted values')
plt.ylabel('Residuals')
plt.legend(loc='upper left')
plt.hlines(y = 0, xmin=10.5, xmax=13.5, color='purple')
plt.show()
Below we can see that the model works fairly well.
plt.scatter(y_train_pred_basic, y_train, c = 'blue', marker='.', label = 'Training data')
plt.scatter(y_test_pred_basic, y_test, c = 'magenta', marker='.', label = 'Test Data')
plt.title('Linear regression with Basic Features')
plt.xlabel('Predicted Values')
plt.ylabel('Actual Values')
plt.legend(loc = 'upper left')
plt.plot([10.5, 13.5], [10.5, 13.5], c = 'green')
plt.show()
The multiple regression model with Random Forest selected variables does perform better pushing the RSME on the test data to 0.1156826. Of course, it includes a far larger number of features(71). Again the residuals are tighly grouped around the mean error.
lr_rfSelect = LinearRegression()
lr_rfSelect.fit(X_train[select_feat_rf], y_train)
LinearRegression()
rmse_cv_tr = rmse_cv_train(lr_rfSelect, X_train[select_feat_rf]).mean()
rmse_cv_tst = rmse_cv_test(lr_rfSelect, X_test[select_feat_rf]).mean()
mae_cv_tr = mae_cv_train(lr_rfSelect, X_train[select_feat_rf]).mean()
mae_cv_tst = mae_cv_test(lr_rfSelect, X_test[select_feat_rf]).mean()
r2adj_cv_tr = r2adj_cv_train(lr_rfSelect, X_train[select_feat_rf]).mean()
r2adj_cv_tst = r2adj_cv_test(lr_rfSelect, X_test[select_feat_rf]).mean()
results_rffeat = pd.DataFrame({'LR RForestFeat Select':{'RMSE CV Train': rmse_cv_tr, 'RMSE CV Test': rmse_cv_tst, 'MAE CV Train': mae_cv_tr, 'MAE CV Test': mae_cv_tst, 'AdjR2 CV Train': r2adj_cv_tr, 'AdjR2 CV Test': r2adj_cv_tst}})
results_rffeat
| LR RForestFeat Select | |
|---|---|
| AdjR2 CV Test | 0.90745 |
| AdjR2 CV Train | 0.90728 |
| MAE CV Test | 0.07883 |
| MAE CV Train | 0.08188 |
| RMSE CV Test | 0.10443 |
| RMSE CV Train | 0.11277 |
pre_results = pd.concat([pre_results, results_rffeat], axis=1)
y_train_rfF = lr_rfSelect.predict(X_train[select_feat_rf])
y_test_rfF = lr_rfSelect.predict(X_test[select_feat_rf])
plt.scatter(y_train_rfF, y_train_rfF - y_train, c='green', marker='.', label= 'Training Data')
plt.scatter(y_test_rfF, y_test_rfF - y_test, c='orange', marker='.', label = 'Test Data')
plt.title('Linear Regression with Rf Feature Selection')
plt.xlabel('Predicted values')
plt.ylabel('Residuals')
plt.legend(loc='upper left')
plt.hlines(y = 0, xmin=10.5, xmax=13.5, color='purple')
plt.show()
plt.scatter(y_train_rfF, y_train, c = 'blue', marker='.', label = 'Training data')
plt.scatter(y_test_rfF, y_test, c = 'magenta', marker='.', label = 'Test Data')
plt.title('Linear regression with Rf Feature Selection')
plt.xlabel('Predicted Values')
plt.ylabel('Actual Values')
plt.legend(loc = 'upper left')
plt.plot([10.5, 13.5], [10.5, 13.5], c = 'green')
plt.show()
The last of the preliminary models creates a multiple regression model using the features picked by the Lasso selector. This set of features gives us the best results thus far.
lr_lasSelect = LinearRegression()
lr_lasSelect.fit(X_train[select_feat_lasso], y_train)
LinearRegression()
rmse_cv_tr = rmse_cv_train(lr_lasSelect, X_train[select_feat_lasso]).mean()
rmse_cv_tst = rmse_cv_test(lr_lasSelect, X_test[select_feat_lasso]).mean()
mae_cv_tr = mae_cv_train(lr_lasSelect, X_train[select_feat_lasso]).mean()
mae_cv_tst = mae_cv_test(lr_lasSelect, X_test[select_feat_lasso]).mean()
r2adj_cv_tr = r2adj_cv_train(lr_lasSelect, X_train[select_feat_lasso]).mean()
r2adj_cv_tst = r2adj_cv_test(lr_lasSelect, X_test[select_feat_lasso]).mean()
results_lassfeat = pd.DataFrame({'LR LassoFeat Select':{'RMSE CV Train': rmse_cv_tr, 'RMSE CV Test': rmse_cv_tst, 'MAE CV Train': mae_cv_tr, 'MAE CV Test': mae_cv_tst, 'AdjR2 CV Train': r2adj_cv_tr, 'AdjR2 CV Test': r2adj_cv_tst}})
results_lassfeat
| LR LassoFeat Select | |
|---|---|
| AdjR2 CV Test | 0.92498 |
| AdjR2 CV Train | 0.92647 |
| MAE CV Test | 0.06953 |
| MAE CV Train | 0.07184 |
| RMSE CV Test | 0.09428 |
| RMSE CV Train | 0.10034 |
pre_results = pd.concat([pre_results, results_lassfeat], axis=1)
y_train_lasF = lr_lasSelect.predict(X_train[select_feat_lasso])
y_test_lasF = lr_lasSelect.predict(X_test[select_feat_lasso])
plt.scatter(y_train_lasF, y_train_lasF - y_train, c='green', marker='.', label= 'Training Data')
plt.scatter(y_test_lasF, y_test_lasF - y_test, c='orange', marker='.', label = 'Test Data')
plt.title('Linear Regression with Lasso regularization')
plt.xlabel('Predicted values')
plt.ylabel('Residuals')
plt.legend(loc='upper left')
plt.hlines(y = 0, xmin=10, xmax=14, color='purple')
plt.show()
plt.scatter(y_train_lasF, y_train, c = 'blue', marker='.', label = 'Training data')
plt.scatter(y_test_lasF, y_test, c = 'magenta', marker='.', label = 'Test Data')
plt.title('Linear Regression w/ Lasso Regularization')
plt.xlabel('Predicted Values')
plt.ylabel('Actual Values')
plt.legend(loc = 'upper left')
plt.plot([10.5, 13.5], [10.5, 13.5], c = 'green')
plt.show()
Here then are all of the preliminary results
pre_results
| Linear Regression | LR BasicFeatures | LR RForestFeat Select | LR LassoFeat Select | |
|---|---|---|---|---|
| AdjR2 CV Test | -1126930666536769945600.00000 | 0.88048 | 0.90745 | 0.92498 |
| AdjR2 CV Train | -4758674034797168640.00000 | 0.88951 | 0.90728 | 0.92647 |
| MAE CV Test | 658335239.48019 | 0.08844 | 0.07883 | 0.06953 |
| MAE CV Train | 21855952.03904 | 0.09001 | 0.08188 | 0.07184 |
| RMSE CV Test | 5750750202.38194 | 0.11922 | 0.10443 | 0.09428 |
| RMSE CV Train | 300810559.93264 | 0.12347 | 0.11277 | 0.10034 |
# pre_results.to_excel(f'pre_results.xlsx')
We now move on to the main models. For all models, I set a K number for cross validation and a variable so as to easily try different feature sets. Note: the commented out bit is a variety of other feature selections that I tried out.
#all features X_train.columns
#Feature Select X_train[Feat_Select]
#Feature Select w/out FeatAdjust X_train[Feat_Select_N]
#RF Selected Features X_train[select_feat_rf]
#Lasso Selected Features X_train[select_feat_lasso]
#Basic Feature X_train[basic_features]
featset_train = X_train[Feat_Select]
featset_test = X_test[Feat_Select]
cV=10
I run the model once using the timeit function which the average time of 7 runs.
%%capture results
%%timeit
model_lr = LinearRegression()
model_lr.fit(featset_train, y_train)
time = results.stdout
Next I implement the model followed by the CV scorers.
model_lr = LinearRegression()
model_lr.fit(featset_train, y_train)
LinearRegression()
rmse_cv_tr = rmse_cv_train(model_lr, featset_train, cV).mean()
rmse_cv_tst = rmse_cv_test(model_lr, featset_test, cV).mean()
mae_cv_tr = mae_cv_train(model_lr, featset_train, cV).mean()
mae_cv_tst = mae_cv_test(model_lr, featset_test, cV).mean()
r2adj_cv_tr = r2adj_cv_train(model_lr,featset_train, cv=cV).mean()
r2adj_cv_tst = r2adj_cv_test(model_lr, featset_test,cv=cV).mean()
Set up predictions for entire train and test set for purpose of charting.
y_train_lr = model_lr.predict(featset_train)
y_test_lr = model_lr.predict(featset_test)
We can see that the plots are very similar to the preliminary regression models.
plt.scatter(y_train_lr, y_train_lr - y_train, c='green', marker='.', label= 'Training Data')
plt.scatter(y_test_lr, y_test_lr - y_test, c='orange', marker='.', label = 'Test Data')
plt.title('Multiple Linear Regression Model')
plt.xlabel('Predicted values')
plt.ylabel('Residuals')
plt.legend(loc='upper left')
plt.hlines(y = 0, xmin=10.5, xmax=13.5, color='purple')
plt.show()
plt.scatter(y_train_lr, y_train, c = 'blue', marker='.', label = 'Training data')
plt.scatter(y_test_lr, y_test, c = 'magenta', marker='.', label = 'Test Data')
plt.title('Multiple Linear Regression Model')
plt.xlabel('Predicted Values')
plt.ylabel('Actual Values')
plt.legend(loc = 'upper left')
plt.plot([10.5, 13.5], [10.5, 13.5], c = 'green')
plt.show()
The results are very impressive. The only issue with the multiple linear regression model is that with some of the larger feature sets the model breaks (as it did with all features included)
result = pd.DataFrame({'Linear Regression':{'RMSE CV Train': rmse_cv_tr, 'RMSE CV Test': rmse_cv_tst, 'MAE CV Train': mae_cv_tr, 'MAE CV Test': mae_cv_tst, 'AdjR2 CV Train': r2adj_cv_tr, 'AdjR2 CV Test': r2adj_cv_tst, 'Time':time}})
result
| Linear Regression | |
|---|---|
| AdjR2 CV Test | 0.90451 |
| AdjR2 CV Train | 0.90538 |
| MAE CV Test | 0.08071 |
| MAE CV Train | 0.08315 |
| RMSE CV Test | 0.10596 |
| RMSE CV Train | 0.11384 |
| Time | 1.57 ms +- 18.7 us per loop (mean +- std. dev.... |
Here, I implement the Lasso Regressor. While for other models I utilized RandomizedSearchCV in order to test different varaibles, with Lasso model I just have to test the alpha score which can simply be done with a test run of the Lasso model which itself is a cv model.
#Linear Regression with Lasso regularization
lasso = LassoCV(alphas = [0.0001, 0.0003, 0.0006, 0.001, 0.003, 0.006, 0.01, 0.03, 0.06, 0.1, 0.3, 0.6, 1], max_iter = 50000, cv = 10)
lasso.fit(featset_train, y_train)
alpha = lasso.alpha_
print('Best alpha: ', alpha)
Best alpha: 0.0001
I check the time it takes to run the Lasso model.
%%capture results
%%timeit
lasso_A = LassoCV(alphas = [alpha*0.6, alpha * 0.65, alpha * 0.7, alpha * 0.75, alpha * 0.8, alpha * 0.9, alpha * 0.95, alpha * 1.05, alpha * 1.1, alpha * 1.15, alpha * 1.25, alpha * 1.3, alpha * 1.35, alpha * 1.4], max_iter=50000, cv = 10)
lasso_A.fit(featset_train, y_train)
time=results.stdout
Run the Lasso model followed by CV scorers
lasso_A = LassoCV(alphas = [alpha*0.6, alpha * 0.65, alpha * 0.7, alpha * 0.75, alpha * 0.8, alpha * 0.9, alpha * 0.95, alpha * 1.05, alpha * 1.1, alpha * 1.15, alpha * 1.25, alpha * 1.3, alpha * 1.35, alpha * 1.4], max_iter=50000, cv = 10)
lasso_A.fit(featset_train, y_train)
print('Best alpha: ', alpha)
Best alpha: 0.0001
rmse_cv_tr = rmse_cv_train(lasso_A, featset_train, cV).mean()
rmse_cv_tst = rmse_cv_test(lasso_A, featset_test, cV).mean()
mae_cv_tr = mae_cv_train(lasso_A, featset_train, cV).mean()
mae_cv_tst = mae_cv_test(lasso_A, featset_test, cV).mean()
r2adj_cv_tr = r2adj_cv_train(lasso_A, featset_train, cv=cV).mean()
r2adj_cv_tst = r2adj_cv_test(lasso_A, featset_test,cv=cV).mean()
y_train_las = lasso_A.predict(featset_train)
y_test_las = lasso_A.predict(featset_test)
The residuals plot looks fine.
plt.scatter(y_train_las, y_train_las - y_train, c='green', marker='.', label= 'Training Data')
plt.scatter(y_test_las, y_test_las - y_test, c='orange', marker='.', label = 'Test Data')
plt.title('Lasso Regression')
plt.xlabel('Predicted values')
plt.ylabel('Residuals')
plt.legend(loc='upper left')
plt.hlines(y = 0, xmin=10, xmax=14, color='purple')
plt.show()
plt.scatter(y_train_las, y_train, c = 'blue', marker='.', label = 'Training data')
plt.scatter(y_test_las, y_test, c = 'magenta', marker='.', label = 'Test Data')
plt.title('Lasso Regression')
plt.xlabel('Predicted Values')
plt.ylabel('Actual Values')
plt.legend(loc = 'upper left')
plt.plot([10.5, 13.5], [10.5, 13.5], c = 'green')
plt.show()
I thought I would check to see what features the model was using.
coeffs = pd.Series(lasso_A.coef_, index=featset_train.columns)
plt.figure(figsize=(12,9))
key_coeffs = pd.concat([coeffs.sort_values().head(12), coeffs.sort_values().tail(12)])
key_coeffs.plot(kind='barh')
plt.title('Coefficients for Lasso Regression')
plt.show()
The results are very good though just a little bit better than plain Multiple Linear Regression.
results_lasso = pd.DataFrame({'Lasso Regression':{'RMSE CV Train': rmse_cv_tr, 'RMSE CV Test': rmse_cv_tst, 'MAE CV Train': mae_cv_tr, 'MAE CV Test': mae_cv_tst, 'AdjR2 CV Train': r2adj_cv_tr, 'AdjR2 CV Test': r2adj_cv_tst, 'Time':time}})
results_lasso
| Lasso Regression | |
|---|---|
| AdjR2 CV Test | 0.90508 |
| AdjR2 CV Train | 0.90540 |
| MAE CV Test | 0.08035 |
| MAE CV Train | 0.08304 |
| RMSE CV Test | 0.10559 |
| RMSE CV Train | 0.11382 |
| Time | 24.8 ms +- 1.58 ms per loop (mean +- std. dev.... |
result = pd.concat([result, results_lasso], axis=1)
Starting with the Decision Tree I use RandomizedSearchCV to narrow down the optimal parameters after a bit of trial and error.
opti_params = {'criterion':['squared_error'],
'splitter':['best'],
'max_depth': [None,5,50,65,75,100],
'max_features':['auto', 'sqrt','log2'],
'max_leaf_nodes':[None,150,175,250,500,750],
'min_samples_leaf':[5,6,7,8,9,10],
'min_weight_fraction_leaf':[0.0],
}
opti_model_decTree = DecisionTreeRegressor(random_state=0)
opti_decTree = RandomizedSearchCV(estimator=opti_model_decTree, param_distributions=opti_params, scoring='neg_mean_squared_error', cv=5, n_iter=100, verbose=1, error_score='raise')
opti_decTree.fit(featset_train, y_train)
Fitting 5 folds for each of 100 candidates, totalling 500 fits
RandomizedSearchCV(cv=5, error_score='raise',
estimator=DecisionTreeRegressor(random_state=0), n_iter=100,
param_distributions={'criterion': ['squared_error'],
'max_depth': [None, 5, 50, 65, 75, 100],
'max_features': ['auto', 'sqrt',
'log2'],
'max_leaf_nodes': [None, 150, 175, 250,
500, 750],
'min_samples_leaf': [5, 6, 7, 8, 9, 10],
'min_weight_fraction_leaf': [0.0],
'splitter': ['best']},
scoring='neg_mean_squared_error', verbose=1)
print('Best Params: ', opti_decTree.best_params_)
print('Lowest RMSE: ', np.sqrt(-opti_decTree.best_score_))
Best Params: {'splitter': 'best', 'min_weight_fraction_leaf': 0.0, 'min_samples_leaf': 10, 'max_leaf_nodes': 500, 'max_features': 'auto', 'max_depth': 75, 'criterion': 'squared_error'}
Lowest RMSE: 0.15870239514520515
I then visualize the loss function to see how adding levels to the tree depth works in relation to MSE.
rmse = []
for depth in range(1, 100):
tree_reg=DecisionTreeRegressor(random_state=0, max_depth=depth)
tree_reg.fit(featset_train, y_train)
tree_predictions=tree_reg.predict(featset_test)
rmse.append(np.sqrt(mean_squared_error(y_test, tree_predictions)))
tree_depths = [depth for depth in range(1,100)]
plt.figure(figsize=(11,7))
plt.title('Decision Tree Regressor Loss')
plt.grid()
plt.plot(tree_depths,rmse)
plt.xlabel('Tree Depth')
plt.ylabel('Root Mean Square Error')
plt.show()
After the preliminary test is over, I time the final model and then implement it again for the various cross validation scorers.
%%capture results
%%timeit
model_decTree = DecisionTreeRegressor(random_state=0, min_samples_leaf=5, max_leaf_nodes=175, max_features='auto', max_depth=75, criterion='squared_error')
model_decTree.fit(featset_train, y_train)
time=results.stdout
model_decTree = DecisionTreeRegressor(random_state=0, max_depth=8, min_samples_leaf=10,max_leaf_nodes=500,criterion='squared_error')
model_decTree.fit(featset_train, y_train)
DecisionTreeRegressor(max_depth=8, max_leaf_nodes=500, min_samples_leaf=10,
random_state=0)
rmse_cv_tr = rmse_cv_train(model_decTree, featset_train, cV).mean()
rmse_cv_tst = rmse_cv_test(model_decTree, featset_test, cV).mean()
mae_cv_tr = mae_cv_train(model_decTree, featset_train, cV).mean()
mae_cv_tst = mae_cv_test(model_decTree, featset_test, cV).mean()
r2adj_cv_tr = r2adj_cv_train(model_decTree, featset_train, cv=cV).mean()
r2adj_cv_tst = r2adj_cv_test(model_decTree, featset_test,cv=cV).mean()
Here I visualize the entire 8 levels of the regression tree. The resulting chart is really big and would be difficult to show in a report or presentation so I created a representation of the first 3 levels to give an idea of how the tree model proceeded.
dot_data = export_graphviz(model_decTree, out_file=None, feature_names=featset_train.columns, filled=True)#, max_depth=3)
vizTree = graphviz.Source(dot_data, format='png')
vizTree
#vizTree.render()
y_prTrain_dt = model_decTree.predict(featset_train)
y_prTest_dt = model_decTree.predict(featset_test)
The residuals seem a bit more spread out but I think it is still okay.
plt.scatter(y_prTrain_dt, y_prTrain_dt - y_train, c='green', marker='.', label= 'Training Data')
plt.scatter(y_prTest_dt, y_prTest_dt - y_test, c='orange', marker='.', label = 'Test Data')
plt.title('Decision Tree Regressor')
plt.xlabel('Predicted values')
plt.ylabel('Residuals')
plt.legend(loc='upper left')
plt.hlines(y = 0, xmin=10, xmax=14, color='purple')
plt.show()
This line chart struck me as a bit funny. The points are all really concentrated in area but that concentration is quite fat.
plt.scatter(y_prTrain_dt, y_train, c = 'blue', marker='.', label = 'Training data')
plt.scatter(y_prTest_dt, y_test, c = 'magenta', marker='.', label = 'Test Data')
plt.title('Decision Tree Regressor')
plt.xlabel('Predicted Values')
plt.ylabel('Actual Values')
plt.legend(loc = 'upper left')
plt.plot([10.5, 13.5], [10.5, 13.5], c = 'green')
plt.show()
The results for the Regression Tree are disappointing in comparison with the other models.
results_decTree = pd.DataFrame({'Decision Tree':{'RMSE CV Train': rmse_cv_tr, 'RMSE CV Test': rmse_cv_tst, 'MAE CV Train': mae_cv_tr, 'MAE CV Test': mae_cv_tst, 'AdjR2 CV Train': r2adj_cv_tr, 'AdjR2 CV Test': r2adj_cv_tst, 'Time':time}})
results_decTree
| Decision Tree | |
|---|---|
| AdjR2 CV Test | 0.75966 |
| AdjR2 CV Train | 0.82707 |
| MAE CV Test | 0.12676 |
| MAE CV Train | 0.11246 |
| RMSE CV Test | 0.16945 |
| RMSE CV Train | 0.15489 |
| Time | 7.93 ms +- 98.4 us per loop (mean +- std. dev.... |
result = pd.concat([result, results_decTree], axis=1)
I follow much the same procedure here beginning with trying out a number of different parameters.
opti_params = {'criterion':['squared_error'],
'n_estimators':[400,500,1000],
'max_depth': [2,50,85],
'max_features':['sqrt','auto'],
'min_samples_split':[2,3,4,5],
'min_samples_leaf':[1,2,3,4]
}
opti_model_rf = RandomForestRegressor()
opti_rf = RandomizedSearchCV(estimator=opti_model_rf, param_distributions=opti_params, scoring='neg_mean_squared_error', cv=5, n_iter=100, verbose=1, n_jobs=-1, error_score='raise')
opti_rf.fit(featset_train, y_train)
Fitting 5 folds for each of 100 candidates, totalling 500 fits
RandomizedSearchCV(cv=5, error_score='raise', estimator=RandomForestRegressor(),
n_iter=100, n_jobs=-1,
param_distributions={'criterion': ['squared_error'],
'max_depth': [2, 50, 85],
'max_features': ['sqrt', 'auto'],
'min_samples_leaf': [1, 2, 3, 4],
'min_samples_split': [2, 3, 4, 5],
'n_estimators': [400, 500, 1000]},
scoring='neg_mean_squared_error', verbose=1)
print('Best Params: ', opti_rf.best_params_)
print('Lowest RMSE: ', np.sqrt(-opti_rf.best_score_))
Best Params: {'n_estimators': 500, 'min_samples_split': 2, 'min_samples_leaf': 1, 'max_features': 'sqrt', 'max_depth': 85, 'criterion': 'squared_error'}
Lowest RMSE: 0.12271954503137808
Again I visualize how the model progresses. Note that here I am using an Out of Bag Error rate.
rf_model = RandomForestRegressor(criterion='mse', random_state=0, min_samples_split=2, min_samples_leaf=1, max_features='sqrt', max_depth=50, n_jobs=-1, oob_score=True)
rf_model.fit(X_train, y_train)
min_estimators=100
max_estimators=1000
error_rate= {'label':[]}
for i in range(min_estimators, max_estimators+1, 5):
rf_model.set_params(n_estimators=i)
rf_model.fit(featset_train,y_train)
oob_error = 1-rf_model.oob_score_
error_rate['label'].append((i,oob_error))
plt.figure(figsize=(11,7))
for label, reg_error in error_rate.items():
xs,ys=zip(*reg_error)
plt.plot(xs,ys,label=label)
plt.title('Random Forest Out of Bag Error Estimate')
plt.xlim(min_estimators, max_estimators)
plt.xlabel('Number of Estimators')
plt.ylabel('OOB error rate')
plt.show()
Check the time for the random forest model
%%capture results
%%timeit
model_rf = RandomForestRegressor(criterion='squared_error', n_estimators=315, max_depth=50, random_state=0, min_samples_split=4, min_samples_leaf=1, max_features='sqrt', n_jobs=-1)
model_rf.fit(featset_train, y_train)
time= results.stdout
Next I implement the model and run the CV scorers.
model_rf = RandomForestRegressor(criterion='mse', n_estimators=315, max_depth=50, random_state=0, min_samples_split=2, min_samples_leaf=1, max_features='sqrt', n_jobs=-1)
model_rf.fit(featset_train, y_train)
RandomForestRegressor(criterion='mse', max_depth=50, max_features='sqrt',
n_estimators=315, n_jobs=-1, random_state=0)
rmse_cv_tr = rmse_cv_train(model_rf, featset_train, cV).mean()
rmse_cv_tst = rmse_cv_test(model_rf, featset_test, cV).mean()
mae_cv_tr = mae_cv_train(model_rf, featset_train, cV).mean()
mae_cv_tst = mae_cv_test(model_rf, featset_test, cV).mean()
r2adj_cv_tr = r2adj_cv_train(model_rf, featset_train, cv=cV).mean()
r2adj_cv_tst = r2adj_cv_test(model_rf, featset_test,cv=cV).mean()
Here I visualized the last of the forest's tree predictors. This is an even bigger tree with a depth of 50. It is quite different than the Regression Tree model which makes sense as the point of the random forest is to create a large number of different trees to capture (reduce) variation in the dataset.
dot_data = export_graphviz(model_rf.estimators_[314], out_file=None, feature_names=featset_train.columns, filled=True)#, max_depth=3)
vizTree = graphviz.Source(dot_data, format='png')
vizTree
#vizTree.render()
I just wanted to check to see the max depth for the tree above.
model_rf.estimators_[314].max_depth
50
y_prTrain_rf = model_rf.predict(featset_train)
y_prTest_rf = model_rf.predict(featset_test)
Again the residuals seem a little funny but I don't believe there is any clear pattern.
plt.scatter(y_prTrain_rf, y_prTrain_rf - y_train, c='green', marker='.', label= 'Training Data')
plt.scatter(y_prTest_rf, y_prTest_rf - y_test, c='orange', marker='.', label = 'Test Data')
plt.title('Random Forest Regressor')
plt.xlabel('Predicted values')
plt.ylabel('Residuals')
plt.legend(loc='upper left')
plt.hlines(y = 0, xmin=10, xmax=14, color='purple')
plt.show()
The data points are tighter for the random forest and not quite as fat.
plt.scatter(y_prTrain_rf, y_train, c = 'blue', marker='.', label = 'Training data')
plt.scatter(y_prTest_rf, y_test, c = 'magenta', marker='.', label = 'Test Data')
plt.title('Random Forest Regressor')
plt.xlabel('Predicted Values')
plt.ylabel('Actual Values')
plt.legend(loc = 'upper left')
plt.plot([10.5, 13.5], [10.5, 13.5], c = 'green')
plt.show()
Fairly decent results although random forest doesn't perform quite as well as plain linear regression and requires far more time.
results_rForest = pd.DataFrame({'Random Forest':{'RMSE CV Train': rmse_cv_tr, 'RMSE CV Test': rmse_cv_tst, 'MAE CV Train': mae_cv_tr, 'MAE CV Test': mae_cv_tst, 'AdjR2 CV Train': r2adj_cv_tr, 'AdjR2 CV Test': r2adj_cv_tst, 'Time':time}})
results_rForest
| Random Forest | |
|---|---|
| AdjR2 CV Test | 0.87241 |
| AdjR2 CV Train | 0.89348 |
| MAE CV Test | 0.08855 |
| MAE CV Train | 0.08594 |
| RMSE CV Test | 0.12412 |
| RMSE CV Train | 0.12111 |
| Time | 242 ms +- 8.09 ms per loop (mean +- std. dev. ... |
result = pd.concat([result,results_rForest], axis=1)
I decided to use XGBoost instead of the scikit-learn's gradient boosting algorithm because it apparently runs faster, is more memory efficient and offers better predictive performance. Conveniently XGBoost does have a sklearn wrapper.
With the XGBoost, I had some trouble when I tried to chart a more developed model. As such, I reversed my normal procedure and start by charting the model's loss function. We can see the the rmse score declining as the model runs through its training iterations.
test_model = xgb.XGBRegressor(random_state=0, n_jobs=-6)
evalset = [(featset_train,y_train), (featset_test,y_test)]
test_model.fit(featset_train, y_train, eval_metric='rmse', eval_set=evalset)
yhat=test_model.predict(featset_test)
score=mean_squared_error(y_test, yhat)
res = test_model.evals_result()
[0] validation_0-rmse:8.07923 validation_1-rmse:8.06223 [1] validation_0-rmse:5.66062 validation_1-rmse:5.64633 [2] validation_0-rmse:3.96769 validation_1-rmse:3.95455 [3] validation_0-rmse:2.78276 validation_1-rmse:2.77111 [4] validation_0-rmse:1.95357 validation_1-rmse:1.94489 [5] validation_0-rmse:1.37376 validation_1-rmse:1.36529 [6] validation_0-rmse:0.96881 validation_1-rmse:0.96115 [7] validation_0-rmse:0.68635 validation_1-rmse:0.67978 [8] validation_0-rmse:0.48986 validation_1-rmse:0.48597 [9] validation_0-rmse:0.35407 validation_1-rmse:0.35253 [10] validation_0-rmse:0.26090 validation_1-rmse:0.26239 [11] validation_0-rmse:0.19685 validation_1-rmse:0.20324 [12] validation_0-rmse:0.15466 validation_1-rmse:0.16596 [13] validation_0-rmse:0.12782 validation_1-rmse:0.14410 [14] validation_0-rmse:0.11081 validation_1-rmse:0.13257 [15] validation_0-rmse:0.10070 validation_1-rmse:0.12606 [16] validation_0-rmse:0.09279 validation_1-rmse:0.12310 [17] validation_0-rmse:0.08685 validation_1-rmse:0.12052 [18] validation_0-rmse:0.08394 validation_1-rmse:0.11953 [19] validation_0-rmse:0.08127 validation_1-rmse:0.11923 [20] validation_0-rmse:0.07856 validation_1-rmse:0.11889 [21] validation_0-rmse:0.07612 validation_1-rmse:0.11883 [22] validation_0-rmse:0.07299 validation_1-rmse:0.11865 [23] validation_0-rmse:0.07056 validation_1-rmse:0.11805 [24] validation_0-rmse:0.06889 validation_1-rmse:0.11761 [25] validation_0-rmse:0.06706 validation_1-rmse:0.11761 [26] validation_0-rmse:0.06611 validation_1-rmse:0.11741 [27] validation_0-rmse:0.06510 validation_1-rmse:0.11727 [28] validation_0-rmse:0.06410 validation_1-rmse:0.11697 [29] validation_0-rmse:0.06226 validation_1-rmse:0.11658 [30] validation_0-rmse:0.06138 validation_1-rmse:0.11650 [31] validation_0-rmse:0.06035 validation_1-rmse:0.11631 [32] validation_0-rmse:0.05976 validation_1-rmse:0.11615 [33] validation_0-rmse:0.05817 validation_1-rmse:0.11576 [34] validation_0-rmse:0.05769 validation_1-rmse:0.11552 [35] validation_0-rmse:0.05618 validation_1-rmse:0.11524 [36] validation_0-rmse:0.05466 validation_1-rmse:0.11523 [37] validation_0-rmse:0.05418 validation_1-rmse:0.11534 [38] validation_0-rmse:0.05312 validation_1-rmse:0.11539 [39] validation_0-rmse:0.05260 validation_1-rmse:0.11537 [40] validation_0-rmse:0.05196 validation_1-rmse:0.11543 [41] validation_0-rmse:0.05110 validation_1-rmse:0.11528 [42] validation_0-rmse:0.05064 validation_1-rmse:0.11526 [43] validation_0-rmse:0.04930 validation_1-rmse:0.11519 [44] validation_0-rmse:0.04819 validation_1-rmse:0.11519 [45] validation_0-rmse:0.04749 validation_1-rmse:0.11519 [46] validation_0-rmse:0.04667 validation_1-rmse:0.11525 [47] validation_0-rmse:0.04581 validation_1-rmse:0.11534 [48] validation_0-rmse:0.04547 validation_1-rmse:0.11548 [49] validation_0-rmse:0.04465 validation_1-rmse:0.11560 [50] validation_0-rmse:0.04446 validation_1-rmse:0.11576 [51] validation_0-rmse:0.04400 validation_1-rmse:0.11589 [52] validation_0-rmse:0.04359 validation_1-rmse:0.11570 [53] validation_0-rmse:0.04283 validation_1-rmse:0.11573 [54] validation_0-rmse:0.04188 validation_1-rmse:0.11578 [55] validation_0-rmse:0.04133 validation_1-rmse:0.11587 [56] validation_0-rmse:0.04064 validation_1-rmse:0.11591 [57] validation_0-rmse:0.03993 validation_1-rmse:0.11607 [58] validation_0-rmse:0.03890 validation_1-rmse:0.11622 [59] validation_0-rmse:0.03866 validation_1-rmse:0.11636 [60] validation_0-rmse:0.03834 validation_1-rmse:0.11637 [61] validation_0-rmse:0.03768 validation_1-rmse:0.11620 [62] validation_0-rmse:0.03716 validation_1-rmse:0.11610 [63] validation_0-rmse:0.03668 validation_1-rmse:0.11597 [64] validation_0-rmse:0.03589 validation_1-rmse:0.11595 [65] validation_0-rmse:0.03547 validation_1-rmse:0.11585 [66] validation_0-rmse:0.03522 validation_1-rmse:0.11593 [67] validation_0-rmse:0.03476 validation_1-rmse:0.11592 [68] validation_0-rmse:0.03423 validation_1-rmse:0.11596 [69] validation_0-rmse:0.03386 validation_1-rmse:0.11593 [70] validation_0-rmse:0.03357 validation_1-rmse:0.11605 [71] validation_0-rmse:0.03306 validation_1-rmse:0.11594 [72] validation_0-rmse:0.03274 validation_1-rmse:0.11596 [73] validation_0-rmse:0.03195 validation_1-rmse:0.11595 [74] validation_0-rmse:0.03138 validation_1-rmse:0.11588 [75] validation_0-rmse:0.03122 validation_1-rmse:0.11580 [76] validation_0-rmse:0.03068 validation_1-rmse:0.11589 [77] validation_0-rmse:0.03039 validation_1-rmse:0.11588 [78] validation_0-rmse:0.02988 validation_1-rmse:0.11590 [79] validation_0-rmse:0.02958 validation_1-rmse:0.11593 [80] validation_0-rmse:0.02946 validation_1-rmse:0.11599 [81] validation_0-rmse:0.02904 validation_1-rmse:0.11604 [82] validation_0-rmse:0.02849 validation_1-rmse:0.11598 [83] validation_0-rmse:0.02840 validation_1-rmse:0.11597 [84] validation_0-rmse:0.02783 validation_1-rmse:0.11596 [85] validation_0-rmse:0.02711 validation_1-rmse:0.11596 [86] validation_0-rmse:0.02702 validation_1-rmse:0.11601 [87] validation_0-rmse:0.02662 validation_1-rmse:0.11603 [88] validation_0-rmse:0.02607 validation_1-rmse:0.11606 [89] validation_0-rmse:0.02570 validation_1-rmse:0.11611 [90] validation_0-rmse:0.02548 validation_1-rmse:0.11606 [91] validation_0-rmse:0.02520 validation_1-rmse:0.11608 [92] validation_0-rmse:0.02466 validation_1-rmse:0.11610 [93] validation_0-rmse:0.02447 validation_1-rmse:0.11609 [94] validation_0-rmse:0.02417 validation_1-rmse:0.11617 [95] validation_0-rmse:0.02406 validation_1-rmse:0.11614 [96] validation_0-rmse:0.02394 validation_1-rmse:0.11612 [97] validation_0-rmse:0.02366 validation_1-rmse:0.11616 [98] validation_0-rmse:0.02312 validation_1-rmse:0.11617 [99] validation_0-rmse:0.02307 validation_1-rmse:0.11622
plt.figure(figsize=(11,7))
plt.title('XGBoost Loss')
plt.plot(res['validation_0']['rmse'], label='train')
plt.plot(res['validation_1']['rmse'], label='test')
plt.ylim(0.001, 1)
plt.ylabel('Root Mean Squared Error')
plt.xlabel('Number of Estimators')
plt.legend()
plt.show()
opti_params = {'n_estimators':[20,50,100],
'max_depth': [2,3,5,6,7,10,15],
'learning_rate':[0.1,0.15,0.300],
'min_child_weight':[4,5,6,7,8,9],
'reg_alpha':[0.1, 0.01, 0.001,0.0001],
'colsample_bytree=1':[2]
}
opti_model_xgb = xgb.XGBRegressor(random_state=42)
opti_xgb = RandomizedSearchCV(estimator=opti_model_xgb, param_distributions=opti_params, scoring='neg_mean_squared_error', cv=5, n_iter=100, n_jobs=-1)
opti_xgb.fit(featset_train, y_train, early_stopping_rounds=5, eval_set=[(featset_test, y_test)])
[19:48:22] WARNING: C:/Users/Administrator/workspace/xgboost-win64_release_1.5.1/src/learner.cc:576:
Parameters: { "colsample_bytree=1" } might not be used.
This could be a false alarm, with some parameters getting used by language bindings but
then being mistakenly passed down to XGBoost core, or some parameter actually being used
but getting flagged wrongly here. Please open an issue if you find any such cases.
[0] validation_0-rmse:10.36208
[1] validation_0-rmse:9.32666
[2] validation_0-rmse:8.39505
[3] validation_0-rmse:7.55590
[4] validation_0-rmse:6.80103
[5] validation_0-rmse:6.12128
[6] validation_0-rmse:5.51034
[7] validation_0-rmse:4.95932
[8] validation_0-rmse:4.46384
[9] validation_0-rmse:4.01845
[10] validation_0-rmse:3.61764
[11] validation_0-rmse:3.25663
[12] validation_0-rmse:2.93153
[13] validation_0-rmse:2.63981
[14] validation_0-rmse:2.37688
[15] validation_0-rmse:2.14036
[16] validation_0-rmse:1.92731
[17] validation_0-rmse:1.73574
[18] validation_0-rmse:1.56356
[19] validation_0-rmse:1.40842
[20] validation_0-rmse:1.26837
[21] validation_0-rmse:1.14279
[22] validation_0-rmse:1.02977
[23] validation_0-rmse:0.92844
[24] validation_0-rmse:0.83718
[25] validation_0-rmse:0.75545
[26] validation_0-rmse:0.68190
[27] validation_0-rmse:0.61557
[28] validation_0-rmse:0.55637
[29] validation_0-rmse:0.50361
[30] validation_0-rmse:0.45584
[31] validation_0-rmse:0.41359
[32] validation_0-rmse:0.37528
[33] validation_0-rmse:0.34128
[34] validation_0-rmse:0.31132
[35] validation_0-rmse:0.28448
[36] validation_0-rmse:0.26104
[37] validation_0-rmse:0.23975
[38] validation_0-rmse:0.22121
[39] validation_0-rmse:0.20478
[40] validation_0-rmse:0.19108
[41] validation_0-rmse:0.17890
[42] validation_0-rmse:0.16828
[43] validation_0-rmse:0.15909
[44] validation_0-rmse:0.15111
[45] validation_0-rmse:0.14432
[46] validation_0-rmse:0.13870
[47] validation_0-rmse:0.13362
[48] validation_0-rmse:0.12963
[49] validation_0-rmse:0.12629
[50] validation_0-rmse:0.12363
[51] validation_0-rmse:0.12124
[52] validation_0-rmse:0.11902
[53] validation_0-rmse:0.11749
[54] validation_0-rmse:0.11610
[55] validation_0-rmse:0.11485
[56] validation_0-rmse:0.11389
[57] validation_0-rmse:0.11315
[58] validation_0-rmse:0.11246
[59] validation_0-rmse:0.11169
[60] validation_0-rmse:0.11134
[61] validation_0-rmse:0.11082
[62] validation_0-rmse:0.11053
[63] validation_0-rmse:0.11030
[64] validation_0-rmse:0.11011
[65] validation_0-rmse:0.10992
[66] validation_0-rmse:0.10958
[67] validation_0-rmse:0.10932
[68] validation_0-rmse:0.10916
[69] validation_0-rmse:0.10913
[70] validation_0-rmse:0.10932
[71] validation_0-rmse:0.10919
[72] validation_0-rmse:0.10919
[73] validation_0-rmse:0.10917
[74] validation_0-rmse:0.10926
RandomizedSearchCV(cv=5,
estimator=XGBRegressor(base_score=None, booster=None,
colsample_bylevel=None,
colsample_bynode=None,
colsample_bytree=None,
enable_categorical=False, gamma=None,
gpu_id=None, importance_type=None,
interaction_constraints=None,
learning_rate=None,
max_delta_step=None, max_depth=None,
min_child_weight=None, missing=nan,
monotone_constraints=...
scale_pos_weight=None, subsample=None,
tree_method=None,
validate_parameters=None,
verbosity=None),
n_iter=100, n_jobs=-1,
param_distributions={'colsample_bytree=1': [2],
'learning_rate': [0.1, 0.15, 0.3],
'max_depth': [2, 3, 5, 6, 7, 10, 15],
'min_child_weight': [4, 5, 6, 7, 8, 9],
'n_estimators': [20, 50, 100],
'reg_alpha': [0.1, 0.01, 0.001,
0.0001]},
scoring='neg_mean_squared_error')
print('Best Params: ', opti_xgb.best_params_)
print('Lowest RMSE: ', np.sqrt(-opti_xgb.best_score_))
Best Params: {'reg_alpha': 0.0001, 'n_estimators': 100, 'min_child_weight': 8, 'max_depth': 15, 'learning_rate': 0.1, 'colsample_bytree=1': 2}
Lowest RMSE: 0.11557831728945535
I decided that I would go with a larger n estimators size than the loss function suggested. In trials the results were a bit better.
%%capture results
%%timeit
model_xgb = xgb.XGBRegressor(n_estimators=100, min_child_weight=8, max_depth=8, colsample_bytree=1, learning_rate=0.15, reg_alpha=0.0001, n_jobs=-1, random_state=0)
model_xgb.fit(featset_train,y_train)
time = results.stdout
After the timing the model, I implement it again along the CV scorers.
model_xgb = model_xgb = xgb.XGBRegressor(n_estimators=100, min_child_weight=8, max_depth=5, colsample_bytree= 1, learning_rate=0.15, reg_alpha=0.0001, n_jobs=-1, random_state=0)
model_xgb.fit(featset_train,y_train)
XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
colsample_bynode=1, colsample_bytree=1, enable_categorical=False,
gamma=0, gpu_id=-1, importance_type=None,
interaction_constraints='', learning_rate=0.15, max_delta_step=0,
max_depth=5, min_child_weight=8, missing=nan,
monotone_constraints='()', n_estimators=100, n_jobs=-1,
num_parallel_tree=1, predictor='auto', random_state=0,
reg_alpha=0.0001, reg_lambda=1, scale_pos_weight=1, subsample=1,
tree_method='exact', validate_parameters=1, verbosity=None)
rmse_cv_tr = rmse_cv_train(model_xgb, featset_train, cV).mean()
rmse_cv_tst = rmse_cv_test(model_xgb, featset_test, cV).mean()
mae_cv_tr = mae_cv_train(model_xgb, featset_train, cV).mean()
mae_cv_tst = mae_cv_test(model_xgb, featset_test, cV).mean()
r2adj_cv_tr = r2adj_cv_train(model_xgb, featset_train, cv=cV).mean()
r2adj_cv_tst = r2adj_cv_test(model_xgb, featset_test,cv=cV).mean()
y_prTrain_xgb = model_xgb.predict(featset_train)
y_prTest_xgb = model_xgb.predict(featset_test)
The residuals plot looks better than the one for random forest.
plt.scatter(y_prTrain_xgb, y_prTrain_xgb - y_train, c='green', marker='.', label= 'Training Data')
plt.scatter(y_prTest_xgb, y_prTest_xgb - y_test, c='orange', marker='.', label = 'Test Data')
plt.title('XGBoost')
plt.xlabel('Predicted values')
plt.ylabel('Residuals')
plt.legend(loc='upper left')
plt.hlines(y = 0, xmin=10, xmax=14, color='purple')
plt.show()
Appearance wise this looks pretty similar to the random forest fitted line chart.
plt.scatter(y_prTrain_xgb, y_train, c = 'blue', marker='.', label = 'Training data')
plt.scatter(y_prTest_xgb, y_test, c = 'magenta', marker='.', label = 'Test Data')
plt.title('XGBoost')
plt.xlabel('Predicted Values')
plt.ylabel('Actual Values')
plt.legend(loc = 'upper left')
plt.plot([10.5, 13.5], [10.5, 13.5], c = 'green')
plt.show()
Quite good results that are right up there with Lasso and Multiple Regression in terms of effectiveness. Of course, the model does take a bit more time but is still quite efficient for an ensemble model.
results_xgboost = pd.DataFrame({'XGBoost':{'RMSE CV Train': rmse_cv_tr, 'RMSE CV Test': rmse_cv_tst, 'MAE CV Train': mae_cv_tr, 'MAE CV Test': mae_cv_tst, 'AdjR2 CV Train': r2adj_cv_tr, 'AdjR2 CV Test': r2adj_cv_tst, 'Time':time}})
results_xgboost
| XGBoost | |
|---|---|
| AdjR2 CV Test | 0.89248 |
| AdjR2 CV Train | 0.90865 |
| MAE CV Test | 0.08368 |
| MAE CV Train | 0.08027 |
| RMSE CV Test | 0.11206 |
| RMSE CV Train | 0.11170 |
| Time | 113 ms +- 3.37 ms per loop (mean +- std. dev. ... |
result = pd.concat([result, results_xgboost], axis=1)
tf.config.list_physical_devices(device_type=None)
[PhysicalDevice(name='/physical_device:CPU:0', device_type='CPU'), PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]
I utilize the GPU in order to speed up caculations.
tf.device('/GPU:0')
<tensorflow.python.eager.context._EagerDeviceContext at 0x1da00b0f200>
Here I define a basic model and then experiment with RandomizedSearchCV to find optimal parameters. Given the time required to run the models this took a while.
def base_model(neurons=1):
base_model = Sequential()
base_model.add(Dense(neurons, input_dim=featset_train.shape[1], kernel_initializer='normal', activation='relu'))
#base_model.add(Dense(neurons, activation='relu'))
base_model.add(Dense(1, kernel_initializer='normal'))
base_model.compile(loss='mean_squared_error', optimizer='adam')
return base_model
Note: For most models I have left the RandomizedSearchCV code but for the neural network I commented it out as it takes so long.
# opti_params = {'neurons':[85],
# 'epochs':[100,250,500],
# 'batch_size':[25,50]}
# opti_model_ann = KerasRegressor(build_fn=base_model, verbose=0)
# opti_ann = RandomizedSearchCV(estimator=opti_model_ann, param_distributions=opti_params, pre_dispatch=1, n_jobs=-4, n_iter=100, cv=5)
# opti_ann_result = opti_ann.fit(featset_train, y_train)
# print("Best: %f using %s" % (np.sqrt(-opti_ann_result.best_score_), opti_ann_result.best_params_))
# means = np.sqrt(-opti_ann_result.cv_results_['mean_test_score'])
# #stds = opti_ann_result.cv_results_['std_test_score']
# params = opti_ann_result.cv_results_['params']
# for mean, param in zip(means,params):
# print("%f with: %r" % (mean,param))
I was having problems with my video card still tying up all its memory after I ran a model. This helped release the memory.
# tf.keras.backend.clear_session()
After trying to figure out what setting might work best, I implement my model. While I did try a model with 2 hidden layers, the results with a single layer of 85 nodes worked the best. Note I am using the sklearn regressor wrapper for Keras.
def ann_model(neurons=1):
base_model = Sequential()
base_model.add(Dense(85, input_dim=featset_train.shape[1], kernel_initializer='normal', activation='relu'))
#base_model.add(Dense(neurons, activation='relu'))
base_model.add(Dense(1, kernel_initializer='normal'))
base_model.compile(loss='mean_squared_error', optimizer='adam')
return base_model
model_ann = KerasRegressor(build_fn=ann_model, epochs = 100, batch_size=25, verbose=0)
history = model_ann.fit(featset_train, y_train, validation_data=(featset_test,y_test), epochs=200, batch_size=20, verbose=0)
print(history.history.keys())
dict_keys(['loss', 'val_loss'])
Here I chart the loss function for the ANN. In contrast to the other models, this was not so helpful. The chart suggests that the RMSE should decline below 0.025 for both train and test data but I could not get anywhere close to those results with my CV scorers.
plt.figure(figsize=(11,7))
plt.title('Neural Network Loss')
plt.plot(history.history['loss'])
plt.plot(history.history['val_loss'])
plt.ylim(0.00001,0.20)
plt.title('Neural Network Loss')
plt.ylabel('Root Mean Squared Error')
plt.xlabel('epoch')
Text(0.5, 0, 'epoch')
# model_ann = KerasRegressor(build_fn=ann_model, epochs = 25, batch_size=10, verbose=0)
In practive I found that a model with 100 epochs and a batch size of 25 produced the best results.
%%capture results
%%timeit
model_ann.fit(featset_train, y_train)
time=results.stdout
model_ann.fit(featset_train, y_train)
<keras.callbacks.History at 0x1da767c51c0>
Here I created a very basic visualization of the ANN model. It really doesn't tell us very much. I need to do some future work to find better ways of representing neural network models. I believe that Tensor Board might be the way to go but I need to do some further work to figure it out.
viz_mod = Sequential([
Dense(85, activation='relu', input_shape=(40,)),
Dense(1)
])
visualizer(viz_mod, filename='ann_model', format='png', view=True)
rmse_cv_tr = rmse_cv_train(model_ann, featset_train, cV, nj=-4, pred=1).mean()
rmse_cv_tst = rmse_cv_test(model_ann, featset_test, cV, nj=-4, pred=1).mean()
mae_cv_tr = mae_cv_train(model_ann, featset_train, cV, nj=-4, pred=1).mean()
mae_cv_tst = mae_cv_test(model_ann, featset_test, cV, nj=-4, pred=1).mean()
r2adj_cv_tr = r2adj_cv_train(model_ann, featset_train, cv=cV, nj=-4, pred=1).mean()
r2adj_cv_tst = r2adj_cv_test(model_ann, featset_test,cv=cV, nj=-4, pred=1).mean()
y_prTrain_ann = model_ann.predict(featset_train)
y_prTest_ann = model_ann.predict(featset_test)
This residuals plot is concerning as the center of the distribution is a fair bit higher than the mean.
plt.scatter(y_prTrain_ann, y_prTrain_ann - y_train, c='green', marker='.', label= 'Training Data')
plt.scatter(y_prTest_ann, y_prTest_ann - y_test, c='orange', marker='.', label = 'Test Data')
plt.title('Neural Network')
plt.xlabel('Predicted values')
plt.ylabel('Residuals')
plt.legend(loc='upper left')
plt.hlines(y = 0, xmin=10, xmax=14, color='purple')
plt.show()
plt.scatter(y_prTrain_ann, y_train, c = 'blue', marker='.', label = 'Training data')
plt.scatter(y_prTest_ann, y_test, c = 'magenta', marker='.', label = 'Test Data')
plt.title('Neural Network')
plt.xlabel('Predicted Values')
plt.ylabel('Actual Values')
plt.legend(loc = 'upper left')
plt.plot([10.5, 13.5], [10.5, 13.5], c = 'green')
plt.show()
The results are okay. It provides better effectiveness metrics than the regression tree. However, it is absolutely the worst model in terms of time efficiency.
results_ann = pd.DataFrame({'NeuralNet':{'RMSE CV Train': rmse_cv_tr, 'RMSE CV Test': rmse_cv_tst, 'MAE CV Train': mae_cv_tr, 'MAE CV Test': mae_cv_tst, 'AdjR2 CV Train': r2adj_cv_tr, 'AdjR2 CV Test': r2adj_cv_tst, 'Time':time}})
results_ann
| NeuralNet | |
|---|---|
| AdjR2 CV Test | 0.82149 |
| AdjR2 CV Train | 0.86703 |
| MAE CV Test | 0.10753 |
| MAE CV Train | 0.09869 |
| RMSE CV Test | 0.14482 |
| RMSE CV Train | 0.13324 |
| Time | 12.5 s +- 873 ms per loop (mean +- std. dev. o... |
result = pd.concat([result, results_ann], axis=1)
Here then are the final results for all models at the particular k level set earlier.
result
| Linear Regression | Lasso Regression | Decision Tree | Random Forest | XGBoost | NeuralNet | |
|---|---|---|---|---|---|---|
| AdjR2 CV Test | 0.90451 | 0.90508 | 0.75966 | 0.87241 | 0.89248 | 0.82149 |
| AdjR2 CV Train | 0.90538 | 0.90540 | 0.82707 | 0.89348 | 0.90865 | 0.86703 |
| MAE CV Test | 0.08071 | 0.08035 | 0.12676 | 0.08855 | 0.08368 | 0.10753 |
| MAE CV Train | 0.08315 | 0.08304 | 0.11246 | 0.08594 | 0.08027 | 0.09869 |
| RMSE CV Test | 0.10596 | 0.10559 | 0.16945 | 0.12412 | 0.11206 | 0.14482 |
| RMSE CV Train | 0.11384 | 0.11382 | 0.15489 | 0.12111 | 0.11170 | 0.13324 |
| Time | 1.57 ms +- 18.7 us per loop (mean +- std. dev.... | 24.8 ms +- 1.58 ms per loop (mean +- std. dev.... | 7.93 ms +- 98.4 us per loop (mean +- std. dev.... | 242 ms +- 8.09 ms per loop (mean +- std. dev. ... | 113 ms +- 3.37 ms per loop (mean +- std. dev. ... | 12.5 s +- 873 ms per loop (mean +- std. dev. o... |
Lastly, I export the results to an excel file which is handy for creating tables for PowerPoint and Word.
# result.to_excel(f'Results_K{cV}.xlsx')